From af36384a45fd9514e23e1d6e521e782e0d32011c Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Wed, 18 Mar 2026 22:51:28 -0400
Subject: [PATCH 01/12] fix: correct span tracking for concurrent subagents in
 Claude Agent SDK wrapper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three bugs surfaced when multiple subagents ran concurrently on the single
interleaved message stream that the Claude Agent SDK produces.

## Bug 1: cleanup() prematurely ended tool spans from other subagents

cleanup() was global — when any subagent's AssistantMessage arrived, it
force-ended ALL active tool spans, including those belonging to other
subagents that had not yet received their ToolResultBlock. This caused tool
output to be silently dropped.

Fix: add an only_parent_tool_use_id parameter to cleanup(). It now filters
by the parent_tool_use_id of the incoming AssistantMessage, leaving tool
spans from other subagent contexts untouched.

## Bug 2: LLMSpanTracker had a single shared state

LLMSpanTracker kept a single current_span, current_output, next_start_time,
etc. When subagent B's LLM turn started, it ended subagent A's span early,
serializing what should have been parallel LLM spans into a sequence and
misrouting tool span parentage.

Fix: refactor LLMSpanTracker to maintain a per-subagent _SubagentState dict
keyed by parent_tool_use_id. A set_context() call on each incoming
AssistantMessage routes all operations (start, end, log, timing) to the
correct subagent's independent state. The orchestrator context
(parent_tool_use_id=None) is seeded at init with the query start time.

## Bug 3: identical concurrent tool calls caused span swapping

When two sibling subagents called the same tool with the same arguments,
acquire_span_for_handler() matched purely by name + input and could not
distinguish which span belonged to which handler invocation, leading to
nested child spans being parented under the wrong tool span.

Fix: add a _dispatch_queues dict (FIFO queue per (tool_name, input_sig)) to
ToolSpanTracker. Spans are enqueued on creation and dequeued in order on
acquire, guaranteeing the first handler invocation gets the first span.
---
 .../wrappers/claude_agent_sdk/_wrapper.py     | 192 ++++--
 ...llel_llm_spans_with_correct_parenting.json | 550 ++++++++++++++++++
 ...leaved_subagent_tool_output_preserved.json | 340 +++++++++++
 .../wrappers/claude_agent_sdk/test_wrapper.py | 533 +++++++++++++++++
 4 files changed, 1579 insertions(+), 36 deletions(-)
 create mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting.json
 create mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_interleaved_subagent_tool_output_preserved.json

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index e019241d..e6e3d901 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -1,5 +1,7 @@
 import asyncio
+import collections
 import dataclasses
+import json
 import logging
 import threading
 import time
@@ -37,12 +39,18 @@ class ParsedToolName:
     mcp_server: str | None = None
 
 
+_UNSET_PARENT = object()
+"""Sentinel to distinguish 'no filter' from 'filter to orchestrator (None)'."""
+
+
 @dataclasses.dataclass
 class _ActiveToolSpan:
     span: Any
     raw_name: str
     display_name: str
     input: Any
+    tool_use_id: str | None = None
+    parent_tool_use_id: str | None = None
     handler_active: bool = False
 
     @property
@@ -236,15 +244,30 @@ async def wrapped_handler(args: Any) -> Any:
     return wrapped_handler
 
 
+def _make_dispatch_key(tool_name: str, tool_input: Any) -> tuple[str, str]:
+    """Create a hashable key for dispatch queue lookup from tool name and input."""
+    try:
+        input_sig = json.dumps(tool_input, sort_keys=True, default=str)
+    except (TypeError, ValueError):
+        input_sig = repr(tool_input)
+    return (tool_name, input_sig)
+
+
 class ToolSpanTracker:
     def __init__(self):
         self._active_spans: dict[str, _ActiveToolSpan] = {}
         self._pending_task_link_tool_use_ids: set[str] = set()
+        # Per-(tool_name, input_signature) FIFO queue of tool_use_ids.
+        # Used by acquire_span_for_handler to disambiguate identical concurrent
+        # tool calls (same name + same input) from sibling subagents.
+        self._dispatch_queues: dict[tuple[str, str], collections.deque[str]] = {}
 
     def start_tool_spans(self, message: Any, llm_span_export: str | None) -> None:
         if llm_span_export is None or not hasattr(message, "content"):
             return
 
+        message_parent_tool_use_id = getattr(message, "parent_tool_use_id", None)
+
         for block in message.content:
             if type(block).__name__ != BlockClassName.TOOL_USE:
                 continue
@@ -277,12 +300,17 @@ def start_tool_spans(self, message: Any, llm_span_export: str | None) -> None:
                 metadata=metadata,
                 parent=llm_span_export,
             )
+            tool_input = getattr(block, "input", None)
             self._active_spans[tool_use_id] = _ActiveToolSpan(
                 span=tool_span,
                 raw_name=parsed_tool_name.raw_name,
                 display_name=parsed_tool_name.display_name,
-                input=getattr(block, "input", None),
+                input=tool_input,
+                tool_use_id=tool_use_id,
+                parent_tool_use_id=message_parent_tool_use_id,
             )
+            dispatch_key = _make_dispatch_key(parsed_tool_name.raw_name, tool_input)
+            self._dispatch_queues.setdefault(dispatch_key, collections.deque()).append(tool_use_id)
             if parsed_tool_name.display_name == "Agent":
                 self._pending_task_link_tool_use_ids.add(tool_use_id)
 
@@ -300,10 +328,19 @@ def finish_tool_spans(self, message: Any) -> None:
 
             self._end_tool_span(str(tool_use_id), tool_result_block=block)
 
-    def cleanup(self, end_time: float | None = None, exclude_tool_use_ids: frozenset[str] | None = None) -> None:
+    def cleanup(
+        self,
+        end_time: float | None = None,
+        exclude_tool_use_ids: frozenset[str] | None = None,
+        only_parent_tool_use_id: Any = _UNSET_PARENT,
+    ) -> None:
         for tool_use_id in list(self._active_spans):
             if exclude_tool_use_ids and tool_use_id in exclude_tool_use_ids:
                 continue
+            if only_parent_tool_use_id is not _UNSET_PARENT:
+                active = self._active_spans.get(tool_use_id)
+                if active is not None and active.parent_tool_use_id != only_parent_tool_use_id:
+                    continue
             self._end_tool_span(tool_use_id, end_time=end_time)
 
     @property
@@ -333,13 +370,37 @@ def acquire_span_for_handler(self, tool_name: Any, args: Any) -> _ActiveToolSpan
             and (active_tool_span.raw_name in candidate_names or active_tool_span.display_name in candidate_names)
         ]
 
-        matched_span = _match_tool_span_for_handler(candidates, args)
+        matched_span = self._match_via_dispatch_queue(parsed_tool_name.raw_name, args, candidates)
+        if matched_span is None:
+            matched_span = _match_tool_span_for_handler(candidates, args)
         if matched_span is None:
             return None
 
         matched_span.activate()
         return matched_span
 
+    def _match_via_dispatch_queue(
+        self, raw_name: str, args: Any, candidates: list[_ActiveToolSpan]
+    ) -> _ActiveToolSpan | None:
+        """Use the dispatch queue to match by tool_use_id when multiple identical
+        candidates exist (same name + same input from different subagents)."""
+        dispatch_key = _make_dispatch_key(raw_name, args)
+        queue = self._dispatch_queues.get(dispatch_key)
+        if not queue:
+            return None
+
+        # Pop tool_use_ids until we find one that corresponds to an available
+        # (non-handler_active) candidate, skipping stale entries.
+        candidate_ids = {c.tool_use_id for c in candidates}
+        while queue:
+            tool_use_id = queue.popleft()
+            if tool_use_id in candidate_ids:
+                for candidate in candidates:
+                    if candidate.tool_use_id == tool_use_id:
+                        return candidate
+
+        return None
+
     def _end_tool_span(
         self, tool_use_id: str, tool_result_block: Any | None = None, end_time: float | None = None
     ) -> None:
@@ -348,6 +409,17 @@ def _end_tool_span(
         if active_tool_span is None:
             return
 
+        # Remove from dispatch queue so stale entries don't accumulate.
+        dispatch_key = _make_dispatch_key(active_tool_span.raw_name, active_tool_span.input)
+        queue = self._dispatch_queues.get(dispatch_key)
+        if queue:
+            try:
+                queue.remove(tool_use_id)
+            except ValueError:
+                pass
+            if not queue:
+                del self._dispatch_queues[dispatch_key]
+
         if tool_result_block is None:
             active_tool_span.span.end(end_time=end_time)
             return
@@ -406,17 +478,49 @@ class LLMSpanTracker:
 
     We end the previous span when the next AssistantMessage arrives, using the marked
     start time to ensure sequential spans (no overlapping LLM spans).
+
+    Each subagent context (identified by parent_tool_use_id) gets its own independent
+    span state so concurrent subagents don't truncate each other's LLM spans.
     """
 
+    @dataclasses.dataclass
+    class _SubagentState:
+        current_span: Any | None = None
+        current_span_export: str | None = None
+        current_parent_export: str | None = None
+        current_output: list[dict[str, Any]] | None = None
+        next_start_time: float | None = None
+
     def __init__(self, query_start_time: float | None = None):
-        self.current_span: Any | None = None
-        self.current_span_export: str | None = None
-        self.current_parent_export: str | None = None
-        self.current_output: list[dict[str, Any]] | None = None
-        self.next_start_time: float | None = query_start_time
+        self._states: dict[str | None, LLMSpanTracker._SubagentState] = {}
+        self._active_context: str | None = None
+        # Seed the orchestrator context (parent_tool_use_id=None) with the
+        # query start time so the first orchestrator LLM span gets the right start.
+        self._states[None] = self._SubagentState(next_start_time=query_start_time)
+
+    def _get_state(self, parent_tool_use_id: str | None = _UNSET_PARENT) -> "_SubagentState":
+        key = self._active_context if parent_tool_use_id is _UNSET_PARENT else parent_tool_use_id
+        state = self._states.get(key)
+        if state is None:
+            state = self._SubagentState()
+            self._states[key] = state
+        return state
+
+    @property
+    def current_span(self) -> Any | None:
+        return self._get_state().current_span
+
+    @property
+    def current_span_export(self) -> str | None:
+        return self._get_state().current_span_export
+
+    def set_context(self, parent_tool_use_id: str | None) -> None:
+        """Set which subagent context subsequent calls operate on."""
+        self._active_context = parent_tool_use_id
 
     def get_next_start_time(self) -> float:
-        return self.next_start_time if self.next_start_time is not None else time.time()
+        state = self._get_state()
+        return state.next_start_time if state.next_start_time is not None else time.time()
 
     def start_llm_span(
         self,
@@ -426,29 +530,30 @@ def start_llm_span(
         parent_export: str | None = None,
         start_time: float | None = None,
     ) -> tuple[dict[str, Any] | None, bool]:
-        """Start a new LLM span, ending the previous one if it exists."""
+        """Start a new LLM span, ending the previous one *in the same context*."""
+        state = self._get_state()
         current_message = _serialize_assistant_message(message)
 
         if (
-            self.current_span
-            and self.next_start_time is None
-            and self.current_parent_export == parent_export
+            state.current_span
+            and state.next_start_time is None
+            and state.current_parent_export == parent_export
             and current_message is not None
         ):
             merged_message = _merge_assistant_messages(
-                self.current_output[0] if self.current_output else None,
+                state.current_output[0] if state.current_output else None,
                 current_message,
             )
             if merged_message is not None:
-                self.current_output = [merged_message]
-                self.current_span.log(output=self.current_output)
+                state.current_output = [merged_message]
+                state.current_span.log(output=state.current_output)
             return merged_message, True
 
         resolved_start_time = start_time if start_time is not None else self.get_next_start_time()
         first_token_time = time.time()
 
-        if self.current_span:
-            self.current_span.end(end_time=resolved_start_time)
+        if state.current_span:
+            state.current_span.end(end_time=resolved_start_time)
 
         final_content, span = _create_llm_span_for_messages(
             [message],
@@ -459,30 +564,40 @@ def start_llm_span(
         )
         if span is not None:
             span.log(metrics={"time_to_first_token": max(0.0, first_token_time - resolved_start_time)})
-        self.current_span = span
-        self.current_span_export = span.export() if span else None
-        self.current_parent_export = parent_export
-        self.current_output = [final_content] if final_content is not None else None
-        self.next_start_time = None
+        state.current_span = span
+        state.current_span_export = span.export() if span else None
+        state.current_parent_export = parent_export
+        state.current_output = [final_content] if final_content is not None else None
+        state.next_start_time = None
         return final_content, False
 
-    def mark_next_llm_start(self) -> None:
-        """Mark when the next LLM call will start (after tool results)."""
-        self.next_start_time = time.time()
+    def mark_next_llm_start(self, parent_tool_use_id: Any = _UNSET_PARENT) -> None:
+        """Mark when the next LLM call will start (after tool results).
+
+        When ``parent_tool_use_id`` is ``None`` (i.e. the message lacks the
+        attribute) but we have an active subagent context, fall back to the
+        active context so the timestamp lands on the correct subagent state
+        rather than the orchestrator state.
+        """
+        if parent_tool_use_id is None and self._active_context is not None:
+            parent_tool_use_id = _UNSET_PARENT
+        self._get_state(parent_tool_use_id).next_start_time = time.time()
 
     def log_usage(self, usage_metrics: dict[str, float]) -> None:
         """Log usage metrics to the current LLM span."""
-        if self.current_span and usage_metrics:
-            self.current_span.log(metrics=usage_metrics)
+        state = self._get_state()
+        if state.current_span and usage_metrics:
+            state.current_span.log(metrics=usage_metrics)
 
     def cleanup(self) -> None:
-        """End any unclosed spans."""
-        if self.current_span:
-            self.current_span.end()
-            self.current_span = None
-            self.current_span_export = None
-            self.current_parent_export = None
-            self.current_output = None
+        """End any unclosed spans across all subagent contexts."""
+        for state in self._states.values():
+            if state.current_span:
+                state.current_span.end()
+                state.current_span = None
+                state.current_span_export = None
+                state.current_parent_export = None
+                state.current_output = None
 
 
 class TaskEventSpanTracker:
@@ -714,6 +829,8 @@ async def receive_response(self) -> AsyncGenerator[Any, None]:
                         message_type = type(message).__name__
 
                         if message_type == MessageClassName.ASSISTANT:
+                            incoming_parent = getattr(message, "parent_tool_use_id", None)
+                            llm_tracker.set_context(incoming_parent)
                             if llm_tracker.current_span and tool_tracker.has_active_spans:
                                 active_subagent_tool_use_ids = (
                                     task_event_span_tracker.active_tool_use_ids
@@ -722,6 +839,7 @@ async def receive_response(self) -> AsyncGenerator[Any, None]:
                                 tool_tracker.cleanup(
                                     end_time=llm_tracker.get_next_start_time(),
                                     exclude_tool_use_ids=active_subagent_tool_use_ids,
+                                    only_parent_tool_use_id=incoming_parent,
                                 )
                             llm_parent_export = task_event_span_tracker.parent_export_for_message(
                                 message,
@@ -746,6 +864,7 @@ async def receive_response(self) -> AsyncGenerator[Any, None]:
                         elif message_type == MessageClassName.USER:
                             tool_tracker.finish_tool_spans(message)
                             has_tool_results = False
+                            user_parent = getattr(message, "parent_tool_use_id", None)
                             if hasattr(message, "content"):
                                 has_tool_results = any(
                                     type(block).__name__ == BlockClassName.TOOL_RESULT for block in message.content
@@ -753,8 +872,9 @@ async def receive_response(self) -> AsyncGenerator[Any, None]:
                                 content = _serialize_content_blocks(message.content)
                                 final_results.append({"content": content, "role": "user"})
                             if has_tool_results:
-                                llm_tracker.mark_next_llm_start()
+                                llm_tracker.mark_next_llm_start(user_parent)
                         elif message_type == MessageClassName.RESULT:
+                            llm_tracker.set_context(None)
                             if hasattr(message, "usage"):
                                 usage_metrics = _extract_usage_from_result_message(message)
                                 llm_tracker.log_usage(usage_metrics)
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting.json b/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting.json
new file mode 100644
index 00000000..c13cb624
--- /dev/null
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting.json
@@ -0,0 +1,550 @@
+{
+  "cassette_name": "test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting",
+  "events": [
+    {
+      "op": "write",
+      "payload": {
+        "kind": "json",
+        "value": {
+          "request": {
+            "hooks": null,
+            "subtype": "initialize"
+          },
+          "request_id": "req_1_test_concurrent",
+          "type": "control_request"
+        }
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "response": {
+          "request_id": "req_1_test_concurrent",
+          "response": {
+            "account": {
+              "apiKeySource": "ANTHROPIC_API_KEY",
+              "tokenSource": "none"
+            },
+            "agents": [],
+            "available_output_styles": [],
+            "commands": [],
+            "models": []
+          },
+          "subtype": "success"
+        },
+        "type": "control_response"
+      }
+    },
+    {
+      "op": "write",
+      "payload": {
+        "kind": "json",
+        "value": {
+          "message": {
+            "content": "Run three tasks.",
+            "role": "user"
+          },
+          "parent_tool_use_id": null,
+          "session_id": "default",
+          "type": "user"
+        }
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "agents": [
+          "general-purpose"
+        ],
+        "apiKeySource": "ANTHROPIC_API_KEY",
+        "claude_code_version": "2.1.71",
+        "cwd": "<REDACTED_CWD>",
+        "fast_mode_state": "off",
+        "mcp_servers": [],
+        "model": "claude-haiku-4-5-20251001",
+        "output_style": "default",
+        "permissionMode": "bypassPermissions",
+        "plugins": [],
+        "session_id": "session-concurrent",
+        "skill_sets": [],
+        "subtype": "init",
+        "type": "system"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_agent_a",
+              "input": {
+                "description": "Task A",
+                "subagent_type": "general-purpose"
+              },
+              "name": "Agent",
+              "type": "tool_use"
+            },
+            {
+              "id": "toolu_agent_b",
+              "input": {
+                "description": "Task B",
+                "subagent_type": "general-purpose"
+              },
+              "name": "Agent",
+              "type": "tool_use"
+            },
+            {
+              "id": "toolu_agent_c",
+              "input": {
+                "description": "Task C",
+                "subagent_type": "general-purpose"
+              },
+              "name": "Agent",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": null,
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "description": "Task A",
+        "session_id": "session-concurrent",
+        "subtype": "task_started",
+        "task_id": "task_a",
+        "task_type": "local_agent",
+        "tool_use_id": "toolu_agent_a",
+        "type": "system",
+        "uuid": "uuid-A-start"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "Prompt for A.",
+              "type": "text"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_a",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "description": "Task B",
+        "session_id": "session-concurrent",
+        "subtype": "task_started",
+        "task_id": "task_b",
+        "task_type": "local_agent",
+        "tool_use_id": "toolu_agent_b",
+        "type": "system",
+        "uuid": "uuid-B-start"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "Prompt for B.",
+              "type": "text"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_b",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "description": "Task C",
+        "session_id": "session-concurrent",
+        "subtype": "task_started",
+        "task_id": "task_c",
+        "task_type": "local_agent",
+        "tool_use_id": "toolu_agent_c",
+        "type": "system",
+        "uuid": "uuid-C-start"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "Prompt for C.",
+              "type": "text"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_c",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_tool_a1",
+              "input": {
+                "command": "echo a1"
+              },
+              "name": "Bash",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": "toolu_agent_a",
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_tool_b1",
+              "input": {
+                "command": "echo b1"
+              },
+              "name": "Bash",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": "toolu_agent_b",
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_tool_c1",
+              "input": {
+                "q": "c1"
+              },
+              "name": "mcp__server__remote_tool",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": "toolu_agent_c",
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "a1-output",
+              "tool_use_id": "toolu_tool_a1",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_a",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "b1-output",
+              "tool_use_id": "toolu_tool_b1",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_b",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "c1-output",
+              "tool_use_id": "toolu_tool_c1",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_c",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_tool_a2",
+              "input": {
+                "file_path": "/tmp/a.txt"
+              },
+              "name": "Read",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": "toolu_agent_a",
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_tool_b2",
+              "input": {
+                "file_path": "/tmp/b.txt"
+              },
+              "name": "Read",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": "toolu_agent_b",
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_tool_c2",
+              "input": {
+                "file_path": "/tmp/c.txt"
+              },
+              "name": "Read",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": "toolu_agent_c",
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "a2-output",
+              "tool_use_id": "toolu_tool_a2",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_a",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "b2-output",
+              "tool_use_id": "toolu_tool_b2",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_b",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "c2-output",
+              "tool_use_id": "toolu_tool_c2",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_c",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "output_file": "",
+        "session_id": "session-concurrent",
+        "status": "completed",
+        "subtype": "task_notification",
+        "summary": "A done",
+        "task_id": "task_a",
+        "tool_use_id": "toolu_agent_a",
+        "type": "system",
+        "usage": {
+          "duration_ms": 500,
+          "tool_uses": 2,
+          "total_tokens": 100
+        },
+        "uuid": "uuid-A-done"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "output_file": "",
+        "session_id": "session-concurrent",
+        "status": "completed",
+        "subtype": "task_notification",
+        "summary": "B done",
+        "task_id": "task_b",
+        "tool_use_id": "toolu_agent_b",
+        "type": "system",
+        "usage": {
+          "duration_ms": 500,
+          "tool_uses": 2,
+          "total_tokens": 100
+        },
+        "uuid": "uuid-B-done"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "output_file": "",
+        "session_id": "session-concurrent",
+        "status": "completed",
+        "subtype": "task_notification",
+        "summary": "C done",
+        "task_id": "task_c",
+        "tool_use_id": "toolu_agent_c",
+        "type": "system",
+        "usage": {
+          "duration_ms": 500,
+          "tool_uses": 2,
+          "total_tokens": 100
+        },
+        "uuid": "uuid-C-done"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "A complete",
+              "tool_use_id": "toolu_agent_a",
+              "type": "tool_result"
+            },
+            {
+              "content": "B complete",
+              "tool_use_id": "toolu_agent_b",
+              "type": "tool_result"
+            },
+            {
+              "content": "C complete",
+              "tool_use_id": "toolu_agent_c",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": null,
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "Done.",
+              "type": "text"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": null,
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "duration_api_ms": 3000,
+        "duration_ms": 5000,
+        "fast_mode_state": "off",
+        "is_error": false,
+        "num_turns": 3,
+        "permission_denials": [],
+        "result": "Done.",
+        "session_id": "session-concurrent",
+        "stop_reason": "end_turn",
+        "subtype": "success",
+        "total_cost_usd": 0.001,
+        "type": "result",
+        "usage": {
+          "cache_creation_input_tokens": 0,
+          "cache_read_input_tokens": 0,
+          "input_tokens": 200,
+          "output_tokens": 50,
+          "service_tier": "standard",
+          "speed": "standard"
+        },
+        "uuid": "uuid-result"
+      }
+    }
+  ],
+  "sdk_version": "0.1.48"
+}
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_interleaved_subagent_tool_output_preserved.json b/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_interleaved_subagent_tool_output_preserved.json
new file mode 100644
index 00000000..1317e35b
--- /dev/null
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_interleaved_subagent_tool_output_preserved.json
@@ -0,0 +1,340 @@
+{
+  "cassette_name": "test_interleaved_subagent_tool_output_preserved",
+  "events": [
+    {
+      "op": "write",
+      "payload": {
+        "kind": "json",
+        "value": {
+          "request": {
+            "hooks": null,
+            "subtype": "initialize"
+          },
+          "request_id": "req_1_test_interleave",
+          "type": "control_request"
+        }
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "response": {
+          "request_id": "req_1_test_interleave",
+          "response": {
+            "account": {
+              "apiKeySource": "ANTHROPIC_API_KEY",
+              "tokenSource": "none"
+            },
+            "agents": [],
+            "available_output_styles": [],
+            "commands": [],
+            "models": []
+          },
+          "subtype": "success"
+        },
+        "type": "control_response"
+      }
+    },
+    {
+      "op": "write",
+      "payload": {
+        "kind": "json",
+        "value": {
+          "message": {
+            "content": "Launch two subagents to process files.",
+            "role": "user"
+          },
+          "parent_tool_use_id": null,
+          "session_id": "default",
+          "type": "user"
+        }
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "agents": [
+          "general-purpose"
+        ],
+        "apiKeySource": "ANTHROPIC_API_KEY",
+        "claude_code_version": "2.1.71",
+        "cwd": "<REDACTED_CWD>",
+        "fast_mode_state": "off",
+        "mcp_servers": [],
+        "model": "claude-haiku-4-5-20251001",
+        "output_style": "default",
+        "permissionMode": "bypassPermissions",
+        "plugins": [],
+        "session_id": "session-interleave-test",
+        "skill_sets": [],
+        "subtype": "init",
+        "type": "system"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_agent_alpha",
+              "input": {
+                "description": "Process alpha file",
+                "subagent_type": "general-purpose"
+              },
+              "name": "Agent",
+              "type": "tool_use"
+            },
+            {
+              "id": "toolu_agent_beta",
+              "input": {
+                "description": "Process beta file",
+                "subagent_type": "general-purpose"
+              },
+              "name": "Agent",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": null,
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "description": "Process alpha file",
+        "session_id": "session-interleave-test",
+        "subtype": "task_started",
+        "task_id": "task_alpha_001",
+        "task_type": "local_agent",
+        "tool_use_id": "toolu_agent_alpha",
+        "type": "system",
+        "uuid": "uuid-alpha-start"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "Process the alpha file.",
+              "type": "text"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_alpha",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "description": "Process beta file",
+        "session_id": "session-interleave-test",
+        "subtype": "task_started",
+        "task_id": "task_beta_001",
+        "task_type": "local_agent",
+        "tool_use_id": "toolu_agent_beta",
+        "type": "system",
+        "uuid": "uuid-beta-start"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "Process the beta file.",
+              "type": "text"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_beta",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_bash_alpha",
+              "input": {
+                "command": "cat /tmp/alpha.txt"
+              },
+              "name": "Bash",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": "toolu_agent_alpha",
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "id": "toolu_read_beta",
+              "input": {
+                "file_path": "/tmp/beta.txt"
+              },
+              "name": "Read",
+              "type": "tool_use"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": "toolu_agent_beta",
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "alpha_file_contents",
+              "tool_use_id": "toolu_bash_alpha",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_alpha",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "beta_file_contents",
+              "tool_use_id": "toolu_read_beta",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": "toolu_agent_beta",
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "output_file": "",
+        "session_id": "session-interleave-test",
+        "status": "completed",
+        "subtype": "task_notification",
+        "summary": "Alpha processed",
+        "task_id": "task_alpha_001",
+        "tool_use_id": "toolu_agent_alpha",
+        "type": "system",
+        "usage": {
+          "duration_ms": 500,
+          "tool_uses": 1,
+          "total_tokens": 100
+        },
+        "uuid": "uuid-alpha-done"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "output_file": "",
+        "session_id": "session-interleave-test",
+        "status": "completed",
+        "subtype": "task_notification",
+        "summary": "Beta processed",
+        "task_id": "task_beta_001",
+        "tool_use_id": "toolu_agent_beta",
+        "type": "system",
+        "usage": {
+          "duration_ms": 600,
+          "tool_uses": 1,
+          "total_tokens": 120
+        },
+        "uuid": "uuid-beta-done"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "content": "alpha processed",
+              "tool_use_id": "toolu_agent_alpha",
+              "type": "tool_result"
+            },
+            {
+              "content": "beta processed",
+              "tool_use_id": "toolu_agent_beta",
+              "type": "tool_result"
+            }
+          ],
+          "role": "user"
+        },
+        "parent_tool_use_id": null,
+        "type": "user"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "Both files have been processed.",
+              "type": "text"
+            }
+          ],
+          "model": "claude-haiku-4-5-20251001"
+        },
+        "parent_tool_use_id": null,
+        "type": "assistant"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "duration_api_ms": 3000,
+        "duration_ms": 5000,
+        "fast_mode_state": "off",
+        "is_error": false,
+        "num_turns": 3,
+        "permission_denials": [],
+        "result": "Both files have been processed.",
+        "session_id": "session-interleave-test",
+        "stop_reason": "end_turn",
+        "subtype": "success",
+        "total_cost_usd": 0.001,
+        "type": "result",
+        "usage": {
+          "cache_creation_input_tokens": 0,
+          "cache_read_input_tokens": 0,
+          "input_tokens": 200,
+          "output_tokens": 50,
+          "service_tier": "standard",
+          "speed": "standard"
+        },
+        "uuid": "uuid-result"
+      }
+    }
+  ],
+  "sdk_version": "0.1.48"
+}
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
index eb12fa3d..e3d82bfd 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
@@ -1862,3 +1862,536 @@ async def main() -> None:
     assert len(task_spans) == 1
     assert task_spans[0]["span_attributes"]["name"] == "Claude Agent"
     assert task_spans[0]["input"] == "Say hi"
+
+
+@pytest.mark.skipif(not CLAUDE_SDK_AVAILABLE, reason="Claude Agent SDK not installed")
+@pytest.mark.asyncio
+async def test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting(memory_logger):
+    """Concurrent subagent LLM spans must run in parallel, not be serialized into a single
+    sequential chain — and every tool span must be parented to its own subagent's LLM span
+    with output preserved.
+
+    Three subagents each perform two interleaved tool rounds:
+      LLM(A:Bash) → LLM(B:Bash) → LLM(C:MCP tool) → result(A) → result(B) → result(C)
+      LLM(A:Read) → LLM(B:Read) → LLM(C:Read)      → result(A) → result(B) → result(C)
+
+    Verifies:
+    - Each subagent gets its own LLM spans (not shared with other subagents)
+    - LLM spans from different subagents overlap in time (parallel execution)
+    - Tool spans are parented to the correct subagent's LLM span
+    - Tool output is preserved despite cross-subagent message interleaving
+    """
+    assert not memory_logger.pop()
+
+    subagents = [
+        {"label": "A", "agent_id": "toolu_agent_a", "task_id": "task_a"},
+        {"label": "B", "agent_id": "toolu_agent_b", "task_id": "task_b"},
+        {"label": "C", "agent_id": "toolu_agent_c", "task_id": "task_c"},
+    ]
+    round1_tools = [
+        {"id": "toolu_tool_a1", "name": "Bash", "agent_id": "toolu_agent_a", "result": "a1-output"},
+        {"id": "toolu_tool_b1", "name": "Bash", "agent_id": "toolu_agent_b", "result": "b1-output"},
+        {
+            "id": "toolu_tool_c1",
+            "name": "mcp__server__remote_tool",
+            "agent_id": "toolu_agent_c",
+            "result": "c1-output",
+        },
+    ]
+    round2_tools = [
+        {"id": "toolu_tool_a2", "name": "Read", "agent_id": "toolu_agent_a", "result": "a2-output"},
+        {"id": "toolu_tool_b2", "name": "Read", "agent_id": "toolu_agent_b", "result": "b2-output"},
+        {"id": "toolu_tool_c2", "name": "Read", "agent_id": "toolu_agent_c", "result": "c2-output"},
+    ]
+    all_tools = round1_tools + round2_tools
+
+    with _patched_claude_sdk(wrap_client=True):
+        options = claude_agent_sdk.ClaudeAgentOptions(
+            model=TEST_MODEL,
+            permission_mode="bypassPermissions",
+        )
+        transport = make_cassette_transport(
+            cassette_name="test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting",
+            prompt="",
+            options=options,
+        )
+
+        async with claude_agent_sdk.ClaudeSDKClient(options=options, transport=transport) as client:
+            await client.query("Run three tasks.")
+            async for message in client.receive_response():
+                if type(message).__name__ == "ResultMessage":
+                    break
+
+    spans = memory_logger.pop()
+    task_spans = _find_spans_by_type(spans, SpanTypeAttribute.TASK)
+    llm_spans = _find_spans_by_type(spans, SpanTypeAttribute.LLM)
+    tool_spans = _find_spans_by_type(spans, SpanTypeAttribute.TOOL)
+
+    all_tools = round1_tools + round2_tools
+
+    # --- 1. All subagent TASK spans exist ---
+    _find_span_by_name(task_spans, "Claude Agent")
+    subagent_task_by_label: dict[str, dict[str, Any]] = {}
+    for sa in subagents:
+        subagent_task_by_label[sa["label"]] = _find_span_by_name(task_spans, f"Task {sa['label']}")
+
+    task_id_by_span = {t["span_id"]: label for label, t in subagent_task_by_label.items()}
+
+    # --- 2. Every tool span has output ---
+    non_agent_tools = [s for s in tool_spans if s["span_attributes"]["name"] != "Agent"]
+    tools_without_output = [s for s in non_agent_tools if s.get("output") is None]
+    assert not tools_without_output, (
+        f"{len(tools_without_output)} of {len(non_agent_tools)} tool spans lost their output. "
+        f"Missing: {[s['span_attributes']['name'] + '(' + s.get('metadata', {}).get('gen_ai.tool.call.id', '?') + ')' for s in tools_without_output]}"
+    )
+
+    # --- 3. Tool spans are parented to the correct subagent's LLM span ---
+    agent_id_to_label = {sa["agent_id"]: sa["label"] for sa in subagents}
+    tool_id_to_label = {t["id"]: agent_id_to_label[t["agent_id"]] for t in all_tools}
+
+    for tool in non_agent_tools:
+        tool_call_id = tool.get("metadata", {}).get("gen_ai.tool.call.id", "")
+        expected_label = tool_id_to_label.get(tool_call_id)
+        if expected_label is None:
+            continue
+
+        parent_llm = next((s for s in llm_spans if s["span_id"] == tool["span_parents"][0]), None)
+        assert parent_llm is not None, f"Tool {tool_call_id} has no parent LLM span"
+
+        llm_task_parent_id = parent_llm["span_parents"][0]
+        actual_label = task_id_by_span.get(llm_task_parent_id)
+        assert actual_label == expected_label, (
+            f"Tool {tool_call_id} should be under subagent {expected_label}, got {actual_label}"
+        )
+
+    # --- 4. Correct tool output content ---
+    for t in all_tools:
+        span = next(s for s in tool_spans if s.get("metadata", {}).get("gen_ai.tool.call.id") == t["id"])
+        assert span["output"]["content"] == t["result"]
+
+    # MCP tool name should be parsed
+    mcp_span = next(s for s in tool_spans if s.get("metadata", {}).get("gen_ai.tool.call.id") == "toolu_tool_c1")
+    assert mcp_span["span_attributes"]["name"] == "remote_tool"
+    assert mcp_span["metadata"].get("mcp.server") == "server"
+
+    # --- 5. Scale check ---
+    assert len(non_agent_tools) == 6
+    assert len(llm_spans) >= 7
+    assert len(task_spans) == 4
+
+    # --- 6. LLM spans from different subagents overlap (not serialized) ---
+    subagent_llm_spans: dict[str, list[dict[str, Any]]] = {sa["label"]: [] for sa in subagents}
+    for llm_span in llm_spans:
+        label = task_id_by_span.get(llm_span["span_parents"][0])
+        if label:
+            subagent_llm_spans[label].append(llm_span)
+
+    for label, llms in subagent_llm_spans.items():
+        assert len(llms) == 2, f"Expected 2 LLM spans for subagent {label} (one per tool round), got {len(llms)}"
+
+    a_first = min(subagent_llm_spans["A"], key=lambda s: s["metrics"]["start"])
+    b_first = min(subagent_llm_spans["B"], key=lambda s: s["metrics"]["start"])
+    assert a_first["metrics"]["end"] > b_first["metrics"]["start"], (
+        f"Subagent A's first LLM span should overlap with B's (not be truncated). "
+        f"A end={a_first['metrics']['end']}, B start={b_first['metrics']['start']}"
+    )
+
+    # --- 7. Tool spans fit within their parent LLM span ---
+    for tool in non_agent_tools:
+        parent_llm = next((s for s in llm_spans if s["span_id"] == tool["span_parents"][0]), None)
+        if parent_llm and "end" in parent_llm.get("metrics", {}):
+            assert tool["metrics"]["start"] >= parent_llm["metrics"]["start"], "Tool starts before parent LLM"
+            assert tool["metrics"]["end"] <= parent_llm["metrics"]["end"], "Tool extends past parent LLM"
+
+
+@pytest.mark.skipif(not CLAUDE_SDK_AVAILABLE, reason="Claude Agent SDK not installed")
+@pytest.mark.asyncio
+async def test_interleaved_subagent_tool_spans_preserve_output(memory_logger):
+    """Cassette-backed test: tool spans from one subagent must retain their
+    output when another subagent's AssistantMessage arrives before the first
+    subagent's ToolResultBlock.
+
+    The cassette replays a realistic SDK message stream where:
+      1. Orchestrator launches subagent-alpha and subagent-beta
+      2. Alpha's LLM turn emits a Bash tool call
+      3. Beta's LLM turn emits a Read tool call BEFORE alpha's tool result
+      4. Alpha's tool result arrives
+      5. Beta's tool result arrives
+
+    Expected: Both Bash and Read tool spans should have their output recorded.
+    Bug: cleanup() in receive_response force-ends alpha's Bash tool span when
+    beta's AssistantMessage arrives, so alpha's ToolResultBlock is silently
+    skipped and its output is lost.
+    """
+    assert not memory_logger.pop()
+
+    with _patched_claude_sdk(wrap_client=True):
+        options = claude_agent_sdk.ClaudeAgentOptions(
+            model=TEST_MODEL,
+            permission_mode="bypassPermissions",
+        )
+        transport = make_cassette_transport(
+            cassette_name="test_interleaved_subagent_tool_output_preserved",
+            prompt="",
+            options=options,
+        )
+
+        async with claude_agent_sdk.ClaudeSDKClient(options=options, transport=transport) as client:
+            await client.query("Launch two subagents to process files.")
+            async for message in client.receive_response():
+                if type(message).__name__ == "ResultMessage":
+                    break
+
+    spans = memory_logger.pop()
+    tool_spans = _find_spans_by_type(spans, SpanTypeAttribute.TOOL)
+
+    bash_span = _find_span_by_name(tool_spans, "Bash")
+    read_span = _find_span_by_name(tool_spans, "Read")
+
+    # Both tool spans should have their output recorded
+    assert bash_span.get("output") is not None, (
+        "Bash tool span output was lost — the cleanup force-ended it before its ToolResultBlock arrived"
+    )
+    assert bash_span["output"]["content"] == "alpha_file_contents"
+
+    assert read_span.get("output") is not None, (
+        "Read tool span output was lost — the cleanup force-ended it before its ToolResultBlock arrived"
+    )
+    assert read_span["output"]["content"] == "beta_file_contents"
+
+
+@pytest.mark.skipif(not CLAUDE_SDK_AVAILABLE, reason="Claude Agent SDK not installed")
+@pytest.mark.asyncio
+async def test_interleaved_subagent_tool_spans_parent_to_correct_llm(memory_logger):
+    """Cassette-backed test: tool spans from interleaved subagents must be
+    parented to the LLM span from their own subagent, not the most recent
+    LLM span from any subagent.
+
+    Uses the same interleaved cassette to verify that even when messages from
+    different subagents interleave on the single message stream, each tool span
+    references the correct LLM parent via parent_tool_use_id routing.
+    """
+    assert not memory_logger.pop()
+
+    with _patched_claude_sdk(wrap_client=True):
+        options = claude_agent_sdk.ClaudeAgentOptions(
+            model=TEST_MODEL,
+            permission_mode="bypassPermissions",
+        )
+        transport = make_cassette_transport(
+            cassette_name="test_interleaved_subagent_tool_output_preserved",
+            prompt="",
+            options=options,
+        )
+
+        async with claude_agent_sdk.ClaudeSDKClient(options=options, transport=transport) as client:
+            await client.query("Launch two subagents to process files.")
+            async for message in client.receive_response():
+                if type(message).__name__ == "ResultMessage":
+                    break
+
+    spans = memory_logger.pop()
+    llm_spans = _find_spans_by_type(spans, SpanTypeAttribute.LLM)
+    tool_spans = _find_spans_by_type(spans, SpanTypeAttribute.TOOL)
+    task_spans = _find_spans_by_type(spans, SpanTypeAttribute.TASK)
+
+    alpha_task = _find_span_by_name(task_spans, "Process alpha file")
+    beta_task = _find_span_by_name(task_spans, "Process beta file")
+
+    bash_span = _find_span_by_name(tool_spans, "Bash")
+    read_span = _find_span_by_name(tool_spans, "Read")
+
+    # Find each tool's parent LLM span
+    bash_parent_llm_id = bash_span["span_parents"][0]
+    read_parent_llm_id = read_span["span_parents"][0]
+
+    bash_parent_llm = next(s for s in llm_spans if s["span_id"] == bash_parent_llm_id)
+    read_parent_llm = next(s for s in llm_spans if s["span_id"] == read_parent_llm_id)
+
+    # Bash's parent LLM should be under alpha's task
+    assert alpha_task["span_id"] in bash_parent_llm["span_parents"], (
+        f"Bash's parent LLM span should be under alpha task, but its parents are {bash_parent_llm['span_parents']}"
+    )
+
+    # Read's parent LLM should be under beta's task
+    assert beta_task["span_id"] in read_parent_llm["span_parents"], (
+        f"Read's parent LLM span should be under beta task, but its parents are {read_parent_llm['span_parents']}"
+    )
+
+    # The two tool spans should have DIFFERENT LLM parents (not shared)
+    assert bash_parent_llm_id != read_parent_llm_id, (
+        "Tool spans from different subagents should be parented to different LLM spans"
+    )
+
+
+@pytest.mark.asyncio
+async def test_concurrent_subagent_tool_output_not_silently_dropped(memory_logger):
+    """cleanup() scoped to a different subagent must not end tool spans from
+    the first subagent.  When only_parent_tool_use_id targets beta's context,
+    alpha's Bash tool span must survive so its ToolResultBlock is recorded.
+    """
+    assert not memory_logger.pop()
+
+    tracker = ToolSpanTracker()
+
+    with start_span(name="Claude Agent", type=SpanTypeAttribute.TASK) as task_span:
+        # Alpha's LLM span and Bash tool span (parent_tool_use_id="call-alpha")
+        llm_span = start_span(
+            name="anthropic.messages.create",
+            type=SpanTypeAttribute.LLM,
+            parent=task_span.export(),
+        )
+        tracker.start_tool_spans(
+            AssistantMessage(
+                content=[ToolUseBlock(id="bash-1", name="Bash", input={"command": "echo hello"})],
+                parent_tool_use_id="call-alpha",
+            ),
+            llm_span.export(),
+        )
+
+        assert tracker.has_active_spans, "Tool span should be active after start_tool_spans"
+
+        # Cleanup triggered by beta's AssistantMessage — scoped to beta's context
+        tracker.cleanup(only_parent_tool_use_id="call-beta")
+
+        # Alpha's tool span should still be active
+        assert tracker.has_active_spans, (
+            "cleanup(only_parent_tool_use_id='call-beta') should not end alpha's tool span"
+        )
+
+        # Alpha's ToolResultBlock arrives and should be recorded
+        tracker.finish_tool_spans(
+            UserMessage(content=[ToolResultBlock(tool_use_id="bash-1", content=[TextBlock("hello")])])
+        )
+        llm_span.end()
+
+    spans = memory_logger.pop()
+    bash_span = _find_span_by_name(
+        [s for s in spans if s.get("span_attributes", {}).get("type") == SpanTypeAttribute.TOOL],
+        "Bash",
+    )
+
+    assert bash_span.get("output") is not None, (
+        "Tool result was silently dropped. cleanup() scoped to a different subagent "
+        "should not have ended this tool span."
+    )
+    assert bash_span["output"]["content"] == "hello"
+
+
+def test_tool_span_tracker_cleanup_preserves_cross_subagent_spans(memory_logger):
+    """cleanup(only_parent_tool_use_id=...) should not end tool spans that
+    belong to a different subagent context.
+
+    Alpha starts a Bash tool span.  A cleanup scoped to beta's context fires.
+    Alpha's span must survive so its ToolResultBlock is recorded.
+    """
+    assert not memory_logger.pop()
+
+    tracker = ToolSpanTracker()
+
+    with start_span(name="Claude Agent", type=SpanTypeAttribute.TASK) as task_span:
+        # Alpha's LLM span and tool span
+        alpha_llm = start_span(
+            name="anthropic.messages.create",
+            type=SpanTypeAttribute.LLM,
+            parent=task_span.export(),
+        )
+        tracker.start_tool_spans(
+            AssistantMessage(
+                content=[ToolUseBlock(id="bash-alpha", name="Bash", input={"command": "echo alpha"})],
+                parent_tool_use_id="call-alpha",
+            ),
+            alpha_llm.export(),
+        )
+
+        # Cleanup triggered by beta's AssistantMessage — scoped to beta
+        tracker.cleanup(only_parent_tool_use_id="call-beta")
+
+        # Alpha's span should still be active
+        assert tracker.has_active_spans, "Alpha's tool span should survive beta-scoped cleanup"
+
+        # Alpha's tool result arrives
+        tracker.finish_tool_spans(
+            UserMessage(content=[ToolResultBlock(tool_use_id="bash-alpha", content=[TextBlock("alpha output")])])
+        )
+        alpha_llm.end()
+
+    spans = memory_logger.pop()
+    bash_spans = [s for s in spans if s.get("span_attributes", {}).get("name") == "Bash"]
+    assert len(bash_spans) == 1
+    bash_span = bash_spans[0]
+
+    assert bash_span.get("output") is not None, (
+        "Tool span output was lost because cleanup() ended a span from a different subagent context."
+    )
+    assert bash_span["output"]["content"] == "alpha output"
+
+
+@pytest.mark.asyncio
+async def test_identical_concurrent_tool_calls_from_sibling_subagents_disambiguated(memory_logger):
+    """When two sibling subagents invoke the same tool with the same args,
+    each handler must acquire the tool span belonging to its own subagent
+    (matched by FIFO dispatch order) rather than stealing the other's span.
+    """
+    assert not memory_logger.pop()
+
+    wrapped_tool_class = _create_tool_wrapper_class(_make_fake_sdk_mcp_tool_class())
+
+    async def echo_handler(args):
+        nested = start_span(name=f"nested_{args['_tag']}")
+        nested.log(input=args)
+        nested.end()
+        return {"content": [{"type": "text", "text": args["_tag"]}]}
+
+    echo_tool = wrapped_tool_class(
+        name="echo",
+        description="Echo a message",
+        input_schema={"type": "object"},
+        handler=echo_handler,
+    )
+
+    tracker = ToolSpanTracker()
+    shared_input = {"message": "hello", "_tag": "alpha"}
+
+    with start_span(name="Claude Agent", type=SpanTypeAttribute.TASK) as task_span:
+        # Subagent alpha's LLM span and tool span
+        alpha_llm = start_span(
+            name="anthropic.messages.create",
+            type=SpanTypeAttribute.LLM,
+            parent=task_span.export(),
+        )
+        tracker.start_tool_spans(
+            AssistantMessage(
+                content=[ToolUseBlock(id="echo-alpha", name="echo", input=shared_input)],
+                parent_tool_use_id="call-alpha",
+            ),
+            alpha_llm.export(),
+        )
+
+        # Subagent beta's LLM span and tool span — same tool, same input
+        beta_llm = start_span(
+            name="anthropic.messages.create",
+            type=SpanTypeAttribute.LLM,
+            parent=task_span.export(),
+        )
+        tracker.start_tool_spans(
+            AssistantMessage(
+                content=[ToolUseBlock(id="echo-beta", name="echo", input=shared_input)],
+                parent_tool_use_id="call-beta",
+            ),
+            beta_llm.export(),
+        )
+
+        _thread_local.tool_span_tracker = tracker
+        try:
+            # Handler for alpha fires first (FIFO order matches creation order)
+            await echo_tool.handler(shared_input)
+            # Handler for beta fires second
+            await echo_tool.handler(shared_input)
+
+            tracker.finish_tool_spans(
+                UserMessage(
+                    content=[ToolResultBlock(tool_use_id="echo-alpha", content=[TextBlock("alpha")])],
+                    parent_tool_use_id="call-alpha",
+                )
+            )
+            tracker.finish_tool_spans(
+                UserMessage(
+                    content=[ToolResultBlock(tool_use_id="echo-beta", content=[TextBlock("beta")])],
+                    parent_tool_use_id="call-beta",
+                )
+            )
+        finally:
+            _clear_tool_span_tracker()
+            tracker.cleanup()
+            alpha_llm.end()
+            beta_llm.end()
+
+    spans = memory_logger.pop()
+    echo_spans = [
+        s for s in _find_spans_by_type(spans, SpanTypeAttribute.TOOL) if s["span_attributes"]["name"] == "echo"
+    ]
+    assert len(echo_spans) == 2, f"Expected 2 echo tool spans, got {len(echo_spans)}"
+
+    # Identify which span belongs to alpha's and beta's tool call
+    alpha_echo = [s for s in echo_spans if s.get("metadata", {}).get("gen_ai.tool.call.id") == "echo-alpha"]
+    beta_echo = [s for s in echo_spans if s.get("metadata", {}).get("gen_ai.tool.call.id") == "echo-beta"]
+    assert len(alpha_echo) == 1, "Should have exactly one alpha echo span"
+    assert len(beta_echo) == 1, "Should have exactly one beta echo span"
+
+    # Both handlers receive the same input with _tag="alpha", so both nested
+    # spans are named "nested_alpha".  Find both by filtering.
+    nested_spans = [s for s in spans if s["span_attributes"]["name"] == "nested_alpha"]
+    assert len(nested_spans) == 2, f"Expected 2 nested spans, got {len(nested_spans)}"
+
+    # The first handler invocation should nest under the first span (alpha),
+    # and the second under the second span (beta).
+    first_nested = nested_spans[0]
+    assert alpha_echo[0]["span_id"] in first_nested["span_parents"], (
+        "First handler's nested span should be parented under alpha's echo tool span, not swapped with beta's."
+    )
+    second_nested = nested_spans[1]
+    assert beta_echo[0]["span_id"] in second_nested["span_parents"], (
+        "Second handler's nested span should be parented under beta's echo tool span, not swapped with alpha's."
+    )
+
+
+def test_dispatch_queue_assigns_identical_tool_spans_in_fifo_order(memory_logger):
+    """ToolSpanTracker.acquire_span_for_handler() should use the dispatch queue
+    to assign identical (same name + same input) tool spans in FIFO order,
+    preventing span swaps between sibling subagents.
+    """
+    assert not memory_logger.pop()
+
+    tracker = ToolSpanTracker()
+    shared_input = {"cmd": "echo hi"}
+
+    with start_span(name="Claude Agent", type=SpanTypeAttribute.TASK) as task_span:
+        llm_alpha = start_span(
+            name="anthropic.messages.create",
+            type=SpanTypeAttribute.LLM,
+            parent=task_span.export(),
+        )
+        tracker.start_tool_spans(
+            AssistantMessage(
+                content=[ToolUseBlock(id="bash-A", name="Bash", input=shared_input)],
+                parent_tool_use_id="call-alpha",
+            ),
+            llm_alpha.export(),
+        )
+
+        llm_beta = start_span(
+            name="anthropic.messages.create",
+            type=SpanTypeAttribute.LLM,
+            parent=task_span.export(),
+        )
+        tracker.start_tool_spans(
+            AssistantMessage(
+                content=[ToolUseBlock(id="bash-B", name="Bash", input=shared_input)],
+                parent_tool_use_id="call-beta",
+            ),
+            llm_beta.export(),
+        )
+
+        # First acquire should return alpha's span (FIFO)
+        first = tracker.acquire_span_for_handler("Bash", shared_input)
+        assert first is not None
+        assert first.tool_use_id == "bash-A", (
+            f"First acquire should return alpha's span (bash-A), got {first.tool_use_id}"
+        )
+
+        # Second acquire should return beta's span
+        second = tracker.acquire_span_for_handler("Bash", shared_input)
+        assert second is not None
+        assert second.tool_use_id == "bash-B", (
+            f"Second acquire should return beta's span (bash-B), got {second.tool_use_id}"
+        )
+
+        # Cleanup
+        first.release()
+        second.release()
+        tracker.cleanup()
+        llm_alpha.end()
+        llm_beta.end()
+
+    memory_logger.pop()  # consume spans

From ef8f62bc3f932a57e9a71d349e2ab447b1efd027 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Wed, 18 Mar 2026 23:39:31 -0400
Subject: [PATCH 02/12] Remove redundant _wrap_tool_factory and tool() patch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 0 of the Claude Agent SDK instrumentation simplification plan.

WrappedSdkMcpTool.__init__ already wraps tool handlers at construction
time, so the separate tool() decorator factory patch was redundant —
every tool() invocation routes through SdkMcpTool (resolved from
tool.__globals__), which is already patched to WrappedSdkMcpTool.

Changes:
- _wrapper.py: remove _wrap_tool_factory function
- __init__.py: remove tool() patching and sys.modules sweep
- test_wrapper.py: remove tool-related save/restore and assertions
---
 .../wrappers/claude_agent_sdk/__init__.py     | 12 +----------
 .../wrappers/claude_agent_sdk/_wrapper.py     | 20 -------------------
 .../wrappers/claude_agent_sdk/test_wrapper.py |  6 ------
 3 files changed, 1 insertion(+), 37 deletions(-)

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/__init__.py b/py/src/braintrust/wrappers/claude_agent_sdk/__init__.py
index 8b596860..1d44358c 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/__init__.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/__init__.py
@@ -19,7 +19,7 @@
 
 from braintrust.logger import NOOP_SPAN, current_span, init_logger
 
-from ._wrapper import _create_client_wrapper_class, _create_tool_wrapper_class, _wrap_tool_factory
+from ._wrapper import _create_client_wrapper_class, _create_tool_wrapper_class
 
 
 logger = logging.getLogger(__name__)
@@ -69,7 +69,6 @@ def setup_claude_agent_sdk(
 
         original_client = claude_agent_sdk.ClaudeSDKClient if hasattr(claude_agent_sdk, "ClaudeSDKClient") else None
         original_tool_class = claude_agent_sdk.SdkMcpTool if hasattr(claude_agent_sdk, "SdkMcpTool") else None
-        original_tool_fn = claude_agent_sdk.tool if hasattr(claude_agent_sdk, "tool") else None
 
         if original_client:
             wrapped_client = _create_client_wrapper_class(original_client)
@@ -89,15 +88,6 @@ def setup_claude_agent_sdk(
                     if getattr(module, "SdkMcpTool", None) is original_tool_class:
                         setattr(module, "SdkMcpTool", wrapped_tool_class)
 
-        if original_tool_fn:
-            wrapped_tool_fn = _wrap_tool_factory(original_tool_fn)
-            claude_agent_sdk.tool = wrapped_tool_fn
-
-            for module in list(sys.modules.values()):
-                if module and hasattr(module, "tool"):
-                    if getattr(module, "tool", None) is original_tool_fn:
-                        setattr(module, "tool", wrapped_tool_fn)
-
         return True
     except ImportError:
         # Not installed - this is expected when using auto_instrument()
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index e6e3d901..66b85a58 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -195,26 +195,6 @@ def __init__(
     return WrappedSdkMcpTool
 
 
-def _wrap_tool_factory(tool_fn: Any) -> Any:
-    """Wrap the tool() factory so decorated handlers inherit the active TOOL span."""
-
-    def wrapped_tool(*args: Any, **kwargs: Any) -> Any:
-        result = tool_fn(*args, **kwargs)
-        if not callable(result):
-            return result
-
-        def wrapped_decorator(handler_fn: Any) -> Any:
-            tool_def = result(handler_fn)
-            if tool_def and hasattr(tool_def, "handler"):
-                tool_name = getattr(tool_def, "name", DEFAULT_TOOL_NAME)
-                tool_def.handler = _wrap_tool_handler(tool_def.handler, tool_name)
-            return tool_def
-
-        return wrapped_decorator
-
-    return wrapped_tool
-
-
 def _wrap_tool_handler(handler: Any, tool_name: Any) -> Any:
     """Wrap a tool handler so nested spans execute under the stream-based TOOL span."""
     if hasattr(handler, "_braintrust_wrapped"):
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
index e3d82bfd..02386156 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
@@ -61,7 +61,6 @@ def memory_logger():
 def _patched_claude_sdk(*, wrap_client: bool = False, wrap_tool_class: bool = False):
     original_client = claude_agent_sdk.ClaudeSDKClient
     original_tool_class = claude_agent_sdk.SdkMcpTool
-    original_tool_fn = claude_agent_sdk.tool
 
     if wrap_client:
         claude_agent_sdk.ClaudeSDKClient = _create_client_wrapper_class(original_client)
@@ -73,7 +72,6 @@ def _patched_claude_sdk(*, wrap_client: bool = False, wrap_tool_class: bool = Fa
     finally:
         claude_agent_sdk.ClaudeSDKClient = original_client
         claude_agent_sdk.SdkMcpTool = original_tool_class
-        claude_agent_sdk.tool = original_tool_fn
 
 
 @pytest.mark.skipif(not CLAUDE_SDK_AVAILABLE, reason="Claude Agent SDK not installed")
@@ -1810,14 +1808,12 @@ async def test_setup_claude_agent_sdk_repro_import_before_setup(memory_logger, m
     assert not memory_logger.pop()
     original_client = claude_agent_sdk.ClaudeSDKClient
     original_tool_class = claude_agent_sdk.SdkMcpTool
-    original_tool_fn = claude_agent_sdk.tool
 
     consumer_module_name = "test_issue7_repro_module"
     consumer_module = types.ModuleType(consumer_module_name)
     consumer_module.ClaudeSDKClient = original_client
     consumer_module.ClaudeAgentOptions = claude_agent_sdk.ClaudeAgentOptions
     consumer_module.SdkMcpTool = original_tool_class
-    consumer_module.tool = original_tool_fn
     monkeypatch.setitem(sys.modules, consumer_module_name, consumer_module)
 
     loop_errors = []
@@ -1827,9 +1823,7 @@ async def test_setup_claude_agent_sdk_repro_import_before_setup(memory_logger, m
         assert setup_claude_agent_sdk(project=PROJECT_NAME, api_key=logger.TEST_API_KEY)
         assert getattr(consumer_module, "ClaudeSDKClient") is not original_client
         assert getattr(consumer_module, "SdkMcpTool") is not original_tool_class
-        assert getattr(consumer_module, "tool") is not original_tool_fn
         assert claude_agent_sdk.SdkMcpTool is not original_tool_class
-        assert claude_agent_sdk.tool is not original_tool_fn
 
         async def main() -> None:
             loop = asyncio.get_running_loop()

From 2a0c034c2caf4dd2b5c4b3b889151cef4d61ce6b Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Wed, 18 Mar 2026 23:54:44 -0400
Subject: [PATCH 03/12] Extract task-event helpers to module-level functions

Step 1 of the Claude Agent SDK instrumentation simplification plan.

Move TaskEventSpanTracker's pure private methods to module-level
functions (_task_span_name, _task_metadata, _task_output) so they can
be reused by the upcoming ContextTracker. The old methods now delegate
to the new functions with one-line bodies.
---
 .../wrappers/claude_agent_sdk/_wrapper.py     | 69 +++++++++++--------
 1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index 66b85a58..a2b7a5cd 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -580,6 +580,43 @@ def cleanup(self) -> None:
                 state.current_output = None
 
 
+def _task_span_name(message: Any, task_id: str) -> str:
+    return getattr(message, "description", None) or getattr(message, "task_type", None) or f"Task {task_id}"
+
+
+def _task_metadata(message: Any) -> dict[str, Any]:
+    return {
+        k: v
+        for k, v in {
+            "task_id": getattr(message, "task_id", None),
+            "session_id": getattr(message, "session_id", None),
+            "tool_use_id": getattr(message, "tool_use_id", None),
+            "task_type": getattr(message, "task_type", None),
+            "status": getattr(message, "status", None),
+            "last_tool_name": getattr(message, "last_tool_name", None),
+            "usage": getattr(message, "usage", None),
+        }.items()
+        if v is not None
+    }
+
+
+def _task_output(message: Any) -> dict[str, Any] | None:
+    summary = getattr(message, "summary", None)
+    output_file = getattr(message, "output_file", None)
+
+    if summary is None and output_file is None:
+        return None
+
+    return {
+        k: v
+        for k, v in {
+            "summary": summary,
+            "output_file": output_file,
+        }.items()
+        if v is not None
+    }
+
+
 class TaskEventSpanTracker:
     def __init__(self, root_span_export: str, tool_tracker: ToolSpanTracker):
         self._root_span_export = root_span_export
@@ -672,39 +709,13 @@ def _parent_export(self, message: Any) -> str:
         return self._tool_tracker.get_span_export(getattr(message, "tool_use_id", None)) or self._root_span_export
 
     def _span_name(self, message: Any, task_id: str) -> str:
-        return getattr(message, "description", None) or getattr(message, "task_type", None) or f"Task {task_id}"
+        return _task_span_name(message, task_id)
 
     def _metadata(self, message: Any) -> dict[str, Any]:
-        metadata = {
-            k: v
-            for k, v in {
-                "task_id": getattr(message, "task_id", None),
-                "session_id": getattr(message, "session_id", None),
-                "tool_use_id": getattr(message, "tool_use_id", None),
-                "task_type": getattr(message, "task_type", None),
-                "status": getattr(message, "status", None),
-                "last_tool_name": getattr(message, "last_tool_name", None),
-                "usage": getattr(message, "usage", None),
-            }.items()
-            if v is not None
-        }
-        return metadata
+        return _task_metadata(message)
 
     def _output(self, message: Any) -> dict[str, Any] | None:
-        summary = getattr(message, "summary", None)
-        output_file = getattr(message, "output_file", None)
-
-        if summary is None and output_file is None:
-            return None
-
-        return {
-            k: v
-            for k, v in {
-                "summary": summary,
-                "output_file": output_file,
-            }.items()
-            if v is not None
-        }
+        return _task_output(message)
 
     def _should_end(self, message_type: str) -> bool:
         return message_type == MessageClassName.TASK_NOTIFICATION

From ff799d0245311fa8500387a06d0e4105cfa7eb29 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Wed, 18 Mar 2026 23:55:52 -0400
Subject: [PATCH 04/12] Add cleanup_context / cleanup_all to ToolSpanTracker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Step 2 of the Claude Agent SDK instrumentation simplification plan.

Add two focused cleanup methods to ToolSpanTracker:
- cleanup_context(): close tool spans for one subagent context, with
  an exclude set for live Agent spans
- cleanup_all(): close all remaining active spans at end-of-stream

The existing cleanup() method is left untouched — the old
receive_response loop still calls it with its current signature.
---
 .../wrappers/claude_agent_sdk/_wrapper.py     | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index a2b7a5cd..3521be02 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -323,6 +323,30 @@ def cleanup(
                     continue
             self._end_tool_span(tool_use_id, end_time=end_time)
 
+    def cleanup_context(
+        self,
+        parent_tool_use_id: str | None,
+        *,
+        end_time: float | None = None,
+        exclude_ids: frozenset[str] = frozenset(),
+    ) -> None:
+        """Close tool spans belonging to one subagent context.
+
+        Skips any span whose tool_use_id is in exclude_ids (live Agent spans).
+        Called before starting a new LLM span for that context.
+        """
+        for tool_use_id in list(self._active_spans):
+            if tool_use_id in exclude_ids:
+                continue
+            if self._active_spans[tool_use_id].parent_tool_use_id != parent_tool_use_id:
+                continue
+            self._end_tool_span(tool_use_id, end_time=end_time)
+
+    def cleanup_all(self, end_time: float | None = None) -> None:
+        """Close all remaining active spans. Called at end-of-stream."""
+        for tool_use_id in list(self._active_spans):
+            self._end_tool_span(tool_use_id, end_time=end_time)
+
     @property
     def has_active_spans(self) -> bool:
         return bool(self._active_spans)

From e198c1b32f8d3c78714ccbd9235ebdbc3926dc79 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Thu, 19 Mar 2026 00:03:03 -0400
Subject: [PATCH 05/12] Migrate mid-stream cleanup call to cleanup_context

Step 3 of the Claude Agent SDK instrumentation simplification plan.

- Replace the mid-stream tool_tracker.cleanup(end_time=...,
  exclude_tool_use_ids=..., only_parent_tool_use_id=...) call in
  receive_response with tool_tracker.cleanup_context(...)
- Simplify old cleanup() to delegate to cleanup_all()
- Update two unit tests that called cleanup(only_parent_tool_use_id=...)
  to use cleanup_context() instead
---
 .../claude_agent_sdk/INSTRUMENTATION.md       | 532 ++++++++++++++++
 .../wrappers/claude_agent_sdk/PLAN.md         | 570 ++++++++++++++++++
 .../claude_agent_sdk/SIMPLIFICATION.md        | 366 +++++++++++
 .../wrappers/claude_agent_sdk/_wrapper.py     |  22 +-
 .../wrappers/claude_agent_sdk/test_wrapper.py |   8 +-
 5 files changed, 1476 insertions(+), 22 deletions(-)
 create mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/INSTRUMENTATION.md
 create mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
 create mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/SIMPLIFICATION.md

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/INSTRUMENTATION.md b/py/src/braintrust/wrappers/claude_agent_sdk/INSTRUMENTATION.md
new file mode 100644
index 00000000..1a7e0b10
--- /dev/null
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/INSTRUMENTATION.md
@@ -0,0 +1,532 @@
+# Claude Agent SDK Instrumentation — Deep Dive
+
+## Overview
+
+This document explains how the Braintrust wrapper instruments the Claude Agent SDK: how the monkeypatch works, what data structures are used, and how they collaborate to produce a correct span tree even when multiple subagents run concurrently on a single interleaved message stream.
+
+---
+
+## 1. The Monkeypatch
+
+`setup_claude_agent_sdk()` (in `__init__.py`) patches three things in the `claude_agent_sdk` module **and** in every already-imported module in `sys.modules`:
+
+```
+claude_agent_sdk.ClaudeSDKClient   →  WrappedClaudeSDKClient   (via _create_client_wrapper_class)
+claude_agent_sdk.SdkMcpTool        →  WrappedSdkMcpTool        (via _create_tool_wrapper_class)
+claude_agent_sdk.tool              →  wrapped_tool_fn           (via _wrap_tool_factory)
+```
+
+All three wrappers are **generated at call time** via factory functions — they dynamically create new classes/functions that subclass or close over the originals. The `sys.modules` sweep handles the case where user code has already done `from claude_agent_sdk import ClaudeSDKClient` before calling `setup_claude_agent_sdk`.
+
+```
+User Code                       Braintrust Wrapper              Original SDK
+─────────                       ──────────────────              ────────────
+ClaudeSDKClient(...)       →    WrappedClaudeSDKClient(...)  →  original.__init__(...)
+  client.query(...)         →    captures prompt + start_time →  original.query(...)
+  client.receive_response() →    starts TASK span             →  original.receive_response()
+                                 processes every message
+                                 creates LLM/TOOL spans
+                                 yields message to user
+```
+
+`WrappedClaudeSDKClient` extends `Wrapper` (a base that proxies attribute access to the inner client), so any attributes the user accesses that aren't explicitly overridden fall through transparently to the original.
+
+### 1b. Why `SdkMcpTool` and `tool` are wrapped separately from `ClaudeSDKClient`
+
+`ClaudeSDKClient` is responsible for the **stream side**: it observes every message,
+creates TOOL spans for each `ToolUseBlock`, and stores them in `ToolSpanTracker`.
+At that point the spans exist but are **not yet the active context** — the tool
+handler hasn't run yet.
+
+`SdkMcpTool` and `tool` are responsible for the **handler side**: they intercept
+tool handler registration at decoration/instantiation time and wrap every handler
+via `_wrap_tool_handler`. When the Claude SDK later calls the handler (through its
+own internal machinery, not Braintrust code), the wrapper fires first:
+
+```python
+async def wrapped_handler(args):
+    active_tool_span = _activate_tool_span_for_handler(tool_name, args)
+
+    if not active_tool_span.has_span:
+        # No stream active — create a standalone TOOL span as a fallback
+        with start_span(name=str(tool_name), type=TOOL, input=args) as span:
+            result = await handler(args)
+            span.log(output=result)
+            return result
+
+    try:
+        return await handler(args)     # ← user code runs here, under the span
+    except Exception as exc:
+        active_tool_span.log_error(exc)
+        raise
+    finally:
+        active_tool_span.release()     # span.unset_current()
+```
+
+`_activate_tool_span_for_handler` reads the thread-local `ToolSpanTracker`, finds
+the pre-created span by `(tool_name, args)`, and calls `span.set_current()` —
+making that span the active context for the duration of the call. Any span the user
+creates *inside* their handler therefore nests under the correct TOOL span
+automatically.
+
+**The two-phase handoff in full:**
+
+```
+receive_response() — Braintrust controls          Claude SDK internals — Braintrust does NOT control
+──────────────────────────────────────            ──────────────────────────────────────────────────
+AssistantMessage arrives
+  → start_tool_spans()
+    → create TOOL span            ─── stored in ToolSpanTracker via thread-local ──→
+    → store in _active_spans
+
+                                                  SDK calls tool.handler(args)
+                                                    → _wrap_tool_handler fires
+                                                    → reads thread-local tracker
+                                                    → acquires + activates TOOL span
+                                                    → user handler runs nested under it
+                                                    → span released (unset_current)
+
+UserMessage arrives (ToolResultBlock)
+  → finish_tool_spans()
+    → log output + end span
+```
+
+**Without the `SdkMcpTool`/`tool` wrappers**, step 2 never happens. The pre-created
+spans sit in the tracker with their context never activated, and any spans created
+inside user handler code have no TOOL span parent — they would float up to the TASK
+span or be rootless.
+
+**The fallback path** (no stream active) covers two practical cases:
+- A tool handler called directly in a unit test.
+- A tool handler invoked before or after a `receive_response()` session.
+
+In both cases `_activate_tool_span_for_handler` finds no `ToolSpanTracker` on the
+thread-local and returns `_NOOP_ACTIVE_TOOL_SPAN`, triggering the `with start_span`
+fallback branch which creates and closes a standalone TOOL span for that single
+invocation.
+
+---
+
+## 2. The SDK Message Stream
+
+The Claude Agent SDK streams messages from a subprocess over a JSON protocol. Every message is surfaced on a single `async for message in client.receive_response()` iterator. When subagents run concurrently, their messages **interleave** on this one stream:
+
+```
+─────── Single stream (time flows down) ────────────────────────────────────────────────
+ AssistantMessage  (orchestrator: calls Agent A and Agent B)
+ SystemMessage     (TaskStarted for task A)
+ SystemMessage     (TaskStarted for task B)
+ AssistantMessage  (subagent A's LLM turn: calls Bash)    ← parent_tool_use_id = "call-A"
+ AssistantMessage  (subagent B's LLM turn: calls Read)    ← parent_tool_use_id = "call-B"
+ UserMessage       (Bash result for A)                    ← parent_tool_use_id = "call-A"
+ UserMessage       (Read result for B)                    ← parent_tool_use_id = "call-B"
+ SystemMessage     (TaskNotification for task A — done)
+ SystemMessage     (TaskNotification for task B — done)
+ ResultMessage     (final usage)
+────────────────────────────────────────────────────────────────────────────────────────
+```
+
+The key field is `parent_tool_use_id`: every message from a subagent carries the `tool_use_id` of the `Agent` tool call that spawned it. Orchestrator messages have `parent_tool_use_id = None`.
+
+---
+
+## 3. The Span Hierarchy Being Built
+
+```
+Claude Agent  [TASK]
+├── anthropic.messages.create  [LLM]   ← orchestrator's turn
+│   ├── Agent  [TOOL]                  ← "Agent" tool call → spawns subagent A
+│   └── Agent  [TOOL]                  ← "Agent" tool call → spawns subagent B
+├── Task A  [TASK]
+│   ├── anthropic.messages.create  [LLM]   ← subagent A turn 1
+│   │   └── Bash  [TOOL]
+│   └── anthropic.messages.create  [LLM]   ← subagent A turn 2
+│       └── Read  [TOOL]
+└── Task B  [TASK]
+    ├── anthropic.messages.create  [LLM]   ← subagent B turn 1
+    │   └── Bash  [TOOL]
+    └── anthropic.messages.create  [LLM]   ← subagent B turn 2
+        └── Read  [TOOL]
+```
+
+Three independent trackers collaborate to build this tree. They are described below.
+
+---
+
+## 4. Data Structures
+
+### 4a. `ParsedToolName` (frozen dataclass)
+
+```python
+@dataclasses.dataclass(frozen=True)
+class ParsedToolName:
+    raw_name: str        # "mcp__server__remote_tool"
+    display_name: str    # "remote_tool"   (or same as raw_name for non-MCP)
+    is_mcp: bool         # True
+    mcp_server: str|None # "server"
+```
+
+MCP tools from the Claude SDK have names like `mcp__myserver__some_tool`. `_parse_tool_name()` splits on `__` to extract `server` and `some_tool`, giving the span a clean display name and storing MCP metadata.
+
+---
+
+### 4b. `_ActiveToolSpan` (dataclass)
+
+One instance per live tool call. Lives in `ToolSpanTracker._active_spans` keyed by `tool_use_id`.
+
+```
+_ActiveToolSpan
+┌─────────────────────────────────────────────────────┐
+│ span              : the Braintrust span object       │
+│ raw_name          : "mcp__server__tool"              │
+│ display_name      : "tool"                           │
+│ input             : {"arg": "val"}  ← from SDK block │
+│ tool_use_id       : "toolu_abc123"                   │
+│ parent_tool_use_id: "toolu_agent_a" ← which subagent │
+│ handler_active    : False ← True while handler runs  │
+└─────────────────────────────────────────────────────┘
+```
+
+`activate()` sets `handler_active=True` and calls `span.set_current()` — making the Braintrust span the active context so any `start_span()` inside a tool handler automatically nests under it. `release()` undoes this.
+
+There is also `_NoopActiveToolSpan` — a sentinel used when no matching span is found. It has the same interface but does nothing, so `_wrap_tool_handler` can call `.activate()` / `.release()` unconditionally without null checks.
+
+---
+
+### 4c. `ToolSpanTracker`
+
+This is the most complex tracker. It manages all live tool spans across all subagent contexts.
+
+```
+ToolSpanTracker
+┌───────────────────────────────────────────────────────────────────────────────┐
+│                                                                               │
+│  _active_spans: dict[tool_use_id → _ActiveToolSpan]                          │
+│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐                        │
+│  │ "toolu_a1"   │  │ "toolu_b1"   │  │ "toolu_c1"   │  ...                   │
+│  │ Bash         │  │ Bash         │  │ remote_tool  │                        │
+│  │ parent=A     │  │ parent=B     │  │ parent=C     │                        │
+│  └──────────────┘  └──────────────┘  └──────────────┘                        │
+│                                                                               │
+│  _dispatch_queues: dict[(tool_name, input_sig) → deque[tool_use_id]]         │
+│  ┌──────────────────────────────────────────────────┐                        │
+│  │ ("Bash", '{"cmd":"echo"}') → deque["a1", "b1"]   │  ← FIFO               │
+│  │ ("Read", '{"path":"/f"}')  → deque["a2", "b2"]   │                        │
+│  └──────────────────────────────────────────────────┘                        │
+│                                                                               │
+│  _pending_task_link_tool_use_ids: set[tool_use_id]                           │
+│  { "toolu_agent_a", "toolu_agent_b" }  ← "Agent" calls awaiting TaskStarted  │
+│                                                                               │
+└───────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Lifecycle of a tool span through `ToolSpanTracker`:**
+
+```
+AssistantMessage arrives with ToolUseBlock
+        │
+        ▼
+  start_tool_spans()
+  ├── creates span with parent = current LLM span export
+  ├── inserts into _active_spans[tool_use_id]
+  ├── enqueues tool_use_id into _dispatch_queues[(name, input)]
+  └── if name == "Agent": adds to _pending_task_link_tool_use_ids
+
+Tool handler is called (by Claude SDK)
+        │
+        ▼
+  _activate_tool_span_for_handler()
+  ├── reads _thread_local.tool_span_tracker
+  └── calls tracker.acquire_span_for_handler(name, args)
+      ├── find candidates: active spans with matching name, not handler_active
+      ├── _match_via_dispatch_queue()   ← try FIFO first
+      │   └── pop from deque, return matching candidate
+      ├── fallback: _match_tool_span_for_handler()  ← exact input match
+      └── matched_span.activate()  → handler_active=True, set_current()
+
+Tool handler finishes / UserMessage with ToolResultBlock arrives
+        │
+        ▼
+  finish_tool_spans()
+  └── _end_tool_span(tool_use_id, tool_result_block=block)
+      ├── pop from _active_spans
+      ├── remove from _dispatch_queues
+      ├── log output from ToolResultBlock
+      └── span.end()
+```
+
+**`_dispatch_queues` — the FIFO disambiguator:**
+
+When subagent A and subagent B both call `Bash` with `{"cmd": "echo hi"}`, two identical `_ActiveToolSpan` entries exist. Without disambiguation, `acquire_span_for_handler` can't tell which handler invocation should own which span. The dispatch queue solves this by recording creation order:
+
+```
+Creation order:         Queue state:
+  span "bash-A" added   →  ("Bash", '{"cmd":"echo hi"}')  deque: ["bash-A"]
+  span "bash-B" added   →  ("Bash", '{"cmd":"echo hi"}')  deque: ["bash-A", "bash-B"]
+
+Handler for A fires:    pop "bash-A"  →  give it bash-A span  ✓
+Handler for B fires:    pop "bash-B"  →  give it bash-B span  ✓
+```
+
+**`cleanup()` — the scoped closer:**
+
+```python
+def cleanup(self, end_time=None, exclude_tool_use_ids=None, only_parent_tool_use_id=_UNSET_PARENT)
+```
+
+Three filter modes:
+- No filters → close all active spans (called at the very end of `receive_response`).
+- `exclude_tool_use_ids` → skip "Agent" spans still waiting for their `TaskStarted` event.
+- `only_parent_tool_use_id` → **only** close spans belonging to a specific subagent context. This is called every time an `AssistantMessage` arrives, scoped to that message's `parent_tool_use_id`, so it never accidentally closes another subagent's still-open tool spans.
+
+---
+
+### 4d. `LLMSpanTracker._SubagentState` (inner dataclass)
+
+One per subagent context. `None` key = orchestrator.
+
+```
+_SubagentState
+┌──────────────────────────────────────────────────────────────────┐
+│ current_span        : the open LLM span (or None)                │
+│ current_span_export : span.export() string for use as parent ref  │
+│ current_parent_export: parent export used when span was created   │
+│ current_output      : [{"role":"assistant","content":[...]}]      │
+│                       accumulated output so streaming chunks merge│
+│ next_start_time     : float timestamp — when the next LLM call    │
+│                       will start (set after tool results arrive)  │
+└──────────────────────────────────────────────────────────────────┘
+```
+
+`next_start_time` is the key to non-overlapping sequential spans within one subagent. The sequence is:
+
+```
+UserMessage (tool results arrive)
+    → mark_next_llm_start()  ← stamps the time NOW
+
+AssistantMessage (next LLM response)
+    → start_llm_span()
+        → resolved_start_time = next_start_time  (the stamp from above)
+        → current_span.end(end_time=resolved_start_time)  ← previous span ends HERE
+        → create new span with start = resolved_start_time
+        → next_start_time = None
+```
+
+This ensures the outgoing LLM span ends exactly when the next one begins — no gap, no overlap — even though the Python code observing the stream sees them arrive sequentially.
+
+---
+
+### 4e. `LLMSpanTracker`
+
+Manages a `_SubagentState` for every subagent context, plus an `_active_context` pointer that says "which state should the next operation touch":
+
+```
+LLMSpanTracker
+┌───────────────────────────────────────────────────────────────────────────────┐
+│                                                                               │
+│  _active_context: "call-A"   ← set by set_context() on each AssistantMessage │
+│                                                                               │
+│  _states: dict[parent_tool_use_id → _SubagentState]                          │
+│  ┌──────────────────┐  ┌──────────────────┐  ┌──────────────────┐            │
+│  │ None (orchestr.) │  │ "call-A"         │  │ "call-B"         │            │
+│  │ next_start=t0    │  │ current_span=s1  │  │ current_span=s2  │            │
+│  │ current_span=s0  │  │ next_start=None  │  │ next_start=t1    │            │
+│  └──────────────────┘  └──────────────────┘  └──────────────────┘            │
+│                                                                               │
+└───────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Context routing via `_get_state`:**
+
+```python
+def _get_state(self, parent_tool_use_id=_UNSET_PARENT):
+    key = self._active_context if parent_tool_use_id is _UNSET_PARENT else parent_tool_use_id
+    ...
+```
+
+- Called with `_UNSET_PARENT` (the default) → uses `_active_context`, whichever subagent was most recently set via `set_context()`.
+- Called with an explicit value (e.g. from `mark_next_llm_start(user_parent)`) → routes directly to that subagent's state regardless of `_active_context`.
+
+This is why `_UNSET_PARENT = object()` exists — it is a sentinel that can be distinguished from `None`, which is a valid key meaning "orchestrator".
+
+**`mark_next_llm_start()` edge case:**
+
+UserMessages from the Claude SDK sometimes don't carry `parent_tool_use_id` even when they belong to a subagent context. The special-case logic handles this:
+
+```python
+def mark_next_llm_start(self, parent_tool_use_id=_UNSET_PARENT):
+    if parent_tool_use_id is None and self._active_context is not None:
+        parent_tool_use_id = _UNSET_PARENT  # fall back to active context
+    self._get_state(parent_tool_use_id).next_start_time = time.time()
+```
+
+If the UserMessage says `parent_tool_use_id=None` (field absent or None) but `_active_context` is set (we are processing a subagent's turn), treat it as "active context" rather than routing to the orchestrator state.
+
+---
+
+### 4f. `TaskEventSpanTracker`
+
+Manages TASK spans for subagent tasks, driven by `SystemMessage` subtypes.
+
+```
+TaskEventSpanTracker
+┌───────────────────────────────────────────────────────────────────────────────┐
+│ _root_span_export        : export of the top-level "Claude Agent" TASK span  │
+│ _tool_tracker            : ref to ToolSpanTracker (to get Agent span export) │
+│                                                                               │
+│ _active_spans            : dict[task_id → span]                              │
+│ ┌──────────────┐  ┌──────────────┐                                           │
+│ │ "task_a"     │  │ "task_b"     │  ...                                      │
+│ │ span=...     │  │ span=...     │                                            │
+│ └──────────────┘  └──────────────┘                                           │
+│                                                                               │
+│ _task_span_by_tool_use_id: dict[agent_tool_use_id → span]                    │
+│ ┌─────────────────────────────────────────────────────┐                      │
+│ │ "toolu_agent_a" → task-A span                       │                      │
+│ │ "toolu_agent_b" → task-B span                       │                      │
+│ └─────────────────────────────────────────────────────┘                      │
+│                                                                               │
+│ _active_task_order       : ["task_a", "task_b"]  ← insertion order          │
+└───────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Lifecycle:**
+
+- `TaskStartedMessage` → create a TASK span. Parent is the `Agent` tool span for this task (looked up via `_tool_tracker.get_span_export(message.tool_use_id)`), falling back to the root span. Also calls `_tool_tracker.mark_task_started(tool_use_id)`, removing the agent tool_use_id from `_pending_task_link_tool_use_ids`, which tells `cleanup()` it is now safe to close that `Agent` span.
+- `TaskProgressMessage` → log metadata/output updates to the existing TASK span.
+- `TaskNotificationMessage` → end the TASK span and remove it from both dicts.
+
+**`parent_export_for_message()`** finds the right parent for a subagent's LLM span given an `AssistantMessage`:
+
+1. If `parent_tool_use_id` is set, look up `_task_span_by_tool_use_id[parent_tool_use_id]` — return that task span as parent. ✓
+2. Else if the message itself contains an `Agent` ToolUseBlock (orchestrator calling a subagent), use the top-level span as parent (not the most recently opened task).
+3. Else fall back to the latest open task span in `_active_task_order`.
+
+---
+
+## 5. Thread-Local: Bridging the Stream to Tool Handlers
+
+The trickiest part is that tool handlers are called **by the Claude SDK** — not directly by Braintrust code. There is no way to pass context as a function argument. The solution is a thread-local:
+
+```python
+_thread_local = threading.local()
+```
+
+At the start of `receive_response()`:
+```python
+_thread_local.tool_span_tracker = tool_tracker
+```
+
+Inside every wrapped tool handler:
+```python
+def _activate_tool_span_for_handler(tool_name, args):
+    tool_span_tracker = getattr(_thread_local, "tool_span_tracker", None)
+    if tool_span_tracker is None:
+        return _NOOP_ACTIVE_TOOL_SPAN  # no tracing session active
+    return tool_span_tracker.acquire_span_for_handler(tool_name, args) or _NOOP_ACTIVE_TOOL_SPAN
+```
+
+This means:
+- One `receive_response()` session running on a thread → that thread's tool handlers find their tracker.
+- If a tool is called outside of a `receive_response()` session → returns `_NOOP_ACTIVE_TOOL_SPAN`, tracing is skipped gracefully.
+- The thread-local is cleaned up in the `finally` block of `receive_response()`.
+
+```
+Thread T1: receive_response() starts
+  _thread_local.tool_span_tracker = tracker_T1
+
+  Claude SDK calls tool handler "Bash" on T1
+    → _activate_tool_span_for_handler reads _thread_local.tool_span_tracker
+    → gets tracker_T1 → acquires correct span → handler runs nested under it
+
+  receive_response() finally:
+    del _thread_local.tool_span_tracker
+```
+
+---
+
+## 6. Full Message Loop
+
+How every message type affects each tracker:
+
+```
+Message arrives from SDK
+│
+├── AssistantMessage  (parent_tool_use_id = X)
+│   ├── llm_tracker.set_context(X)              → route all LLM ops to subagent X's state
+│   ├── if current LLM span + active tool spans:
+│   │   tool_tracker.cleanup(                   → close only X's dangling tool spans
+│   │       end_time=next_start_time,           → timed to the gap before this LLM span
+│   │       exclude=active_subagent_ids,        → leave "Agent" spans still open
+│   │       only_parent=X)                      → don't touch other subagents' tool spans
+│   ├── task_event_span_tracker
+│   │       .parent_export_for_message()        → find which TASK span is the parent
+│   ├── llm_tracker.start_llm_span(...)         → end previous span for X; start new one
+│   └── tool_tracker.start_tool_spans(...)      → open tool spans for any ToolUseBlocks
+│
+├── UserMessage  (parent_tool_use_id = X)
+│   ├── tool_tracker.finish_tool_spans(...)     → close tool spans with output from ToolResultBlocks
+│   └── if has_tool_results:
+│       llm_tracker.mark_next_llm_start(X)     → stamp "next LLM for X starts now"
+│
+├── ResultMessage
+│   ├── llm_tracker.set_context(None)           → route to orchestrator state
+│   └── llm_tracker.log_usage(...)              → attach token usage to orchestrator LLM span
+│
+└── SystemMessage / TaskStarted / TaskProgress / TaskNotification
+    └── task_event_span_tracker.process(...)    → create / update / end TASK spans
+```
+
+---
+
+## 7. End-to-End Example: Two Concurrent Subagents
+
+Walkthrough of exactly what the three trackers look like at each step for the `test_interleaved_subagent_tool_output_preserved` scenario:
+
+```
+Stream event                         ToolSpanTracker._active_spans    LLMTracker._states           TaskEventSpanTracker
+────────────────────────────────     ────────────────────────────     ─────────────────────        ────────────────────
+[1] AssistantMessage(parent=None)    {}                               {None: {span=LLM-0}}         {}
+    orchestrator calls Agent(α), Agent(β)
+    after start_tool_spans:
+                                     {"call-α": Agent-span-α,
+                                      "call-β": Agent-span-β}
+                                     pending: {"call-α", "call-β"}
+
+[2] TaskStartedMessage(task=alpha)   pending: {"call-β"}             (unchanged)                  {"alpha": Task-α (parent=Agent-α)}
+[3] TaskStartedMessage(task=beta)    pending: {}                     (unchanged)                  {"alpha": Task-α, "beta": Task-β}
+
+[4] AssistantMessage(parent=call-α)  (subagent alpha's LLM turn: Bash call)
+    set_context("call-α")
+    cleanup(only_parent="call-α")    → closes nothing (α has no old tool spans)
+    start_llm_span                   (unchanged)                     {"call-α": {span=LLM-α}}
+    start_tool_spans("bash-1")       {"call-α": Bash-span (parent=LLM-α),
+                                      "call-β": Agent-span-β}
+
+[5] AssistantMessage(parent=call-β)  (subagent beta's LLM turn: Read call)
+    set_context("call-β")
+    cleanup(only_parent="call-β")    → closes nothing (β has no old tool spans)
+                                     Bash-span is NOT closed ← key fix
+    start_llm_span                   (unchanged)                     {"call-β": {span=LLM-β}}
+    start_tool_spans("read-1")       {"call-α": Bash-span (still open!),
+                                      "call-β": Read-span (parent=LLM-β)}
+
+[6] UserMessage(ToolResult bash-1="alpha_file_contents", parent=call-α)
+    finish_tool_spans                Bash-span.log(output), Bash-span.end()
+    mark_next_llm_start("call-α")                                    {call-α: {next_start=now}}
+
+[7] UserMessage(ToolResult read-1="beta_file_contents", parent=call-β)
+    finish_tool_spans                Read-span.log(output), Read-span.end()
+    mark_next_llm_start("call-β")                                    {call-β: {next_start=now}}
+
+[8] ResultMessage
+    set_context(None)
+    log_usage                                                         LLM-0.log(tokens)
+
+finally:
+    task_event_span_tracker.cleanup()  → end Task-α, Task-β
+    tool_tracker.cleanup()             → end Agent-α, Agent-β (if still open)
+    llm_tracker.cleanup()              → end LLM-α, LLM-β, LLM-0
+```
+
+At step [5], the old code called `cleanup()` globally, ending Bash-span before step [6] could record its output. The `only_parent_tool_use_id="call-β"` filter introduced by the fix prevents that — Bash-span survives to receive its result.
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md b/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
new file mode 100644
index 00000000..f88a4014
--- /dev/null
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
@@ -0,0 +1,570 @@
+# Simplification Plan: Claude Agent SDK Instrumentation
+
+Replace `LLMSpanTracker` + `TaskEventSpanTracker` with a single `ContextTracker`
+class that consumes the raw SDK message stream and owns all span bookkeeping.
+See `SIMPLIFICATION.md` for the full rationale.
+
+**Unchanged:** `WrappedSdkMcpTool`, `_wrap_tool_handler`,
+`_activate_tool_span_for_handler`, `_thread_local`, `_dispatch_queues`,
+`next_llm_start` stamping, test cassettes.
+
+---
+
+## Target Design
+
+### `_AgentContext`
+
+One instance per subagent context, keyed by `parent_tool_use_id` (`None` =
+orchestrator).
+
+```python
+@dataclasses.dataclass
+class _AgentContext:
+    llm_span: Any | None = None          # current open LLM span
+    llm_output: list[dict[str, Any]] | None = None  # accumulated output for merge path
+    next_llm_start: float | None = None  # timestamp from tool results
+    task_span: Any | None = None         # TASK span for this subagent
+    task_confirmed: bool = False         # True after TaskStartedMessage
+```
+
+Three fields dropped vs the old trackers:
+- `llm_span_export` → derived: `ctx.llm_span.export() if ctx.llm_span else None`
+- `llm_parent_export` → redundant: consecutive merges only happen in the
+  orchestrator context where the parent never changes (see SIMPLIFICATION.md §3b)
+- `task_id` → written to metadata at creation, never read back
+
+### `ContextTracker` — public API
+
+```python
+class ContextTracker:
+    def __init__(self, root_span, prompt, query_start_time=None):
+        self._root_span = root_span
+        self._root_span_export = root_span.export()
+        self._prompt = prompt
+        self._tool_tracker = ToolSpanTracker()        # private, also set on _thread_local
+        self._contexts: dict[str | None, _AgentContext] = {
+            None: _AgentContext(next_llm_start=query_start_time)
+        }
+        self._active_key: str | None = None           # most recent parent_tool_use_id
+        self._task_order: list[str | None] = []       # insertion-order for parent fallback
+        self._final_results: list[dict[str, Any]] = []
+        self._task_events: list[dict[str, Any]] = []
+        _thread_local.tool_span_tracker = self._tool_tracker
+
+    def add(self, message) -> None:
+        """Dispatch one SDK message to the appropriate handler."""
+        message_type = type(message).__name__
+        if message_type == MessageClassName.ASSISTANT:
+            self._handle_assistant(message)
+        elif message_type == MessageClassName.USER:
+            self._handle_user(message)
+        elif message_type == MessageClassName.RESULT:
+            self._handle_result(message)
+        elif message_type in SYSTEM_MESSAGE_TYPES:
+            self._handle_system(message)
+
+    def log_output(self) -> None:
+        if self._final_results:
+            self._root_span.log(output=self._final_results[-1])
+
+    def log_tasks(self) -> None:
+        if self._task_events:
+            self._root_span.log(metadata={"task_events": self._task_events})
+
+    def cleanup(self) -> None:
+        for ctx in self._contexts.values():
+            if ctx.llm_span:
+                ctx.llm_span.end()
+                ctx.llm_span = None
+            if ctx.task_span:
+                ctx.task_span.end()
+                ctx.task_span = None
+        self._task_order.clear()
+        self._tool_tracker.cleanup_all()
+        if hasattr(_thread_local, "tool_span_tracker"):
+            delattr(_thread_local, "tool_span_tracker")
+```
+
+### `ContextTracker` — internal handlers
+
+#### `_handle_assistant`
+
+Called on each `AssistantMessage`. This is the most complex handler because it
+orchestrates tool cleanup, LLM span creation/merge, tool span creation, and
+agent context pre-registration — all scoped to the correct subagent context.
+
+Corresponds to the `AssistantMessage` branch of the current `receive_response`
+loop, which coordinates across all three old trackers.
+
+```python
+def _handle_assistant(self, message: Any) -> None:
+    incoming_parent = getattr(message, "parent_tool_use_id", None)
+    self._active_key = incoming_parent
+    ctx = self._get_context(incoming_parent)
+
+    # 1. Close dangling tool spans from the previous turn in this context.
+    #    Skip Agent tool spans that are still live (pending or task running).
+    #    Replaces: tool_tracker.cleanup(end_time=..., exclude_tool_use_ids=...,
+    #              only_parent_tool_use_id=...)
+    if ctx.llm_span and self._tool_tracker.has_active_spans:
+        self._tool_tracker.cleanup_context(
+            incoming_parent,
+            end_time=ctx.next_llm_start or time.time(),
+            exclude_ids=self._live_agent_tool_use_ids(),
+        )
+
+    # 2. Resolve LLM span parent, then create or merge.
+    #    Replaces: task_event_span_tracker.parent_export_for_message(...)
+    #              + llm_tracker.start_llm_span(...)
+    parent_export = self._llm_parent_for_message(message)
+    final_content, extended = self._start_or_merge_llm_span(message, parent_export, ctx)
+
+    # 3. Open TOOL spans for tool calls in this message (parent = LLM span).
+    #    Replaces: tool_tracker.start_tool_spans(message, llm_tracker.current_span_export)
+    llm_export = ctx.llm_span.export() if ctx.llm_span else None
+    self._tool_tracker.start_tool_spans(message, llm_export)
+
+    # 4. Pre-create contexts for Agent tool calls so cleanup_context will
+    #    skip them before their TaskStartedMessage arrives.
+    #    Replaces: tool_tracker._pending_task_link_tool_use_ids.add(...)
+    self._register_pending_agent_contexts(message)
+
+    # 5. Accumulate conversation history.
+    if final_content:
+        if (extended
+                and self._final_results
+                and self._final_results[-1].get("role") == "assistant"):
+            self._final_results[-1] = final_content
+        else:
+            self._final_results.append(final_content)
+```
+
+#### `_handle_user`
+
+Called on each `UserMessage`. Finishes tool spans that have results, serializes
+content for conversation history, and stamps `next_llm_start` on the correct
+context.
+
+The context resolution here replaces the `_UNSET_PARENT` sentinel: if the
+`UserMessage` has no `parent_tool_use_id`, we use `_active_key` (the most
+recently seen `AssistantMessage`'s context) instead of falling back inside the
+tracker.
+
+```python
+def _handle_user(self, message: Any) -> None:
+    self._tool_tracker.finish_tool_spans(message)
+    has_tool_results = False
+    if hasattr(message, "content"):
+        has_tool_results = any(
+            type(b).__name__ == BlockClassName.TOOL_RESULT for b in message.content
+        )
+        content = _serialize_content_blocks(message.content)
+        self._final_results.append({"content": content, "role": "user"})
+    if has_tool_results:
+        user_parent = getattr(message, "parent_tool_use_id", None)
+        resolved_key = user_parent if user_parent is not None else self._active_key
+        self._get_context(resolved_key).next_llm_start = time.time()
+```
+
+#### `_handle_result`
+
+Called on `ResultMessage` (end of stream). Logs usage metrics to the
+orchestrator's LLM span and session metadata to the root span.
+
+```python
+def _handle_result(self, message: Any) -> None:
+    self._active_key = None
+    if hasattr(message, "usage"):
+        usage_metrics = _extract_usage_from_result_message(message)
+        ctx = self._get_context(None)
+        if ctx.llm_span and usage_metrics:
+            ctx.llm_span.log(metrics=usage_metrics)
+    result_metadata = {
+        k: v for k, v in {
+            "num_turns": getattr(message, "num_turns", None),
+            "session_id": getattr(message, "session_id", None),
+        }.items() if v is not None
+    }
+    if result_metadata:
+        self._root_span.log(metadata=result_metadata)
+```
+
+#### `_handle_system`
+
+Called on `SystemMessage` subtypes (TaskStarted, TaskProgress,
+TaskNotification). Resolves the Agent tool span export from `ToolSpanTracker`,
+then delegates to `_process_task_event`.
+
+This keeps `ContextTracker` and `ToolSpanTracker` loosely coupled:
+`ContextTracker` asks for the export string; `ToolSpanTracker` doesn't need a
+back-reference.
+
+```python
+def _handle_system(self, message: Any) -> None:
+    agent_span_export = self._tool_tracker.get_span_export(
+        getattr(message, "tool_use_id", None)
+    )
+    self._process_task_event(message, agent_span_export)
+    self._task_events.append(_serialize_system_message(message))
+```
+
+### `ContextTracker` — internal helpers
+
+#### `_get_context`
+
+Lazy-create `_AgentContext` instances on demand.
+
+```python
+def _get_context(self, key: str | None) -> _AgentContext:
+    ctx = self._contexts.get(key)
+    if ctx is None:
+        ctx = _AgentContext()
+        self._contexts[key] = ctx
+    return ctx
+```
+
+#### `_register_pending_agent_contexts`
+
+Pre-create an `_AgentContext` (with `task_confirmed=False`) for each Agent tool
+call in an `AssistantMessage`. This ensures `_live_agent_tool_use_ids` will
+include them, preventing `cleanup_context` from closing the Agent tool span
+before its `TaskStartedMessage` arrives.
+
+Replaces `ToolSpanTracker._pending_task_link_tool_use_ids.add()`.
+
+```python
+def _register_pending_agent_contexts(self, message: Any) -> None:
+    if not hasattr(message, "content"):
+        return
+    for block in message.content:
+        if (type(block).__name__ == BlockClassName.TOOL_USE
+                and getattr(block, "name", None) == "Agent"):
+            tool_use_id = getattr(block, "id", None)
+            if tool_use_id:
+                self._get_context(str(tool_use_id))
+```
+
+#### `_live_agent_tool_use_ids`
+
+Returns tool_use_ids of Agent spans that must not be closed yet. Includes both
+unconfirmed contexts (pending) and confirmed contexts whose task span is still
+open.
+
+Replaces the union of `task_event_span_tracker.active_tool_use_ids |
+tool_tracker.pending_task_link_tool_use_ids` in the old `receive_response`.
+
+```python
+def _live_agent_tool_use_ids(self) -> frozenset[str]:
+    result: set[str] = set()
+    for key, ctx in self._contexts.items():
+        if key is None:
+            continue
+        if not ctx.task_confirmed or ctx.task_span is not None:
+            result.add(key)
+    return frozenset(result)
+```
+
+#### `_llm_parent_for_message`
+
+Determines the parent span export for an incoming `AssistantMessage`.
+
+Replaces `TaskEventSpanTracker.parent_export_for_message()`. The logic is the
+same but reads directly from `_contexts` instead of a separate
+`_task_span_by_tool_use_id` dict.
+
+```python
+def _llm_parent_for_message(self, message: Any) -> str:
+    parent_tool_use_id = getattr(message, "parent_tool_use_id", None)
+
+    # 1. Subagent message → use that subagent's task span.
+    if parent_tool_use_id is not None:
+        ctx = self._contexts.get(str(parent_tool_use_id))
+        if ctx is not None and ctx.task_span is not None:
+            return ctx.task_span.export()
+
+    # 2. Orchestrator launching Agent tools → root span (not a task span).
+    if _message_starts_subagent_tool(message):
+        return self._root_span_export
+
+    # 3. Fallback: most recently opened task span (orchestrator messages
+    #    that arrive while a subagent task is running).
+    for key in reversed(self._task_order):
+        ctx = self._contexts.get(key)
+        if ctx is not None and ctx.task_span is not None:
+            return ctx.task_span.export()
+
+    # 4. Root span.
+    return self._root_span_export
+```
+
+#### `_start_or_merge_llm_span`
+
+Starts a new LLM span or extends the existing one via merge.
+
+**Merge path:** consecutive `AssistantMessage`s in the same context with no tool
+results between them (`ctx.next_llm_start is None`). This happens in the
+orchestrator context when the model emits a thinking block then a tool-call
+block as two separate messages. Returns `(merged_content, True)`.
+
+**New span path:** ends the previous span at `resolved_start`, opens a fresh
+one. Returns `(final_content, False)`.
+
+The `llm_parent_export` guard from `LLMSpanTracker` is dropped — see
+SIMPLIFICATION.md §3b for why it's always true in practice.
+
+```python
+def _start_or_merge_llm_span(
+    self, message: Any, parent_export: str | None, ctx: _AgentContext,
+) -> tuple[dict[str, Any] | None, bool]:
+    current_message = _serialize_assistant_message(message)
+
+    # Merge path.
+    if ctx.llm_span and ctx.next_llm_start is None and current_message is not None:
+        merged = _merge_assistant_messages(
+            ctx.llm_output[0] if ctx.llm_output else None,
+            current_message,
+        )
+        if merged is not None:
+            ctx.llm_output = [merged]
+            ctx.llm_span.log(output=ctx.llm_output)
+        return merged, True
+
+    # New span path.
+    resolved_start = ctx.next_llm_start or time.time()
+    first_token_time = time.time()
+
+    if ctx.llm_span:
+        ctx.llm_span.end(end_time=resolved_start)
+
+    final_content, span = _create_llm_span_for_messages(
+        [message], self._prompt, self._final_results,
+        parent=parent_export, start_time=resolved_start,
+    )
+    if span is not None:
+        span.log(metrics={"time_to_first_token": max(0.0, first_token_time - resolved_start)})
+    ctx.llm_span = span
+    ctx.llm_output = [final_content] if final_content is not None else None
+    ctx.next_llm_start = None
+    return final_content, False
+```
+
+#### `_process_task_event`
+
+Handles TaskStarted / TaskProgress / TaskNotification system messages.
+
+Key difference from `TaskEventSpanTracker.process()`: contexts are keyed by
+`tool_use_id` (not `task_id`), because that's the same key used everywhere else
+in `ContextTracker`. The old tracker maintained two parallel dicts
+(`_active_spans` keyed by `task_id` and `_task_span_by_tool_use_id` keyed by
+`tool_use_id`); this merges them.
+
+```python
+def _process_task_event(self, message: Any, agent_span_export: str | None) -> None:
+    task_id = getattr(message, "task_id", None)
+    if task_id is None:
+        return
+    task_id = str(task_id)
+    tool_use_id = getattr(message, "tool_use_id", None)
+    tool_use_id_str = str(tool_use_id) if tool_use_id is not None else None
+    ctx = self._get_context(tool_use_id_str)
+    message_type = type(message).__name__
+
+    if ctx.task_span is None:
+        # TaskStartedMessage — open the TASK span.
+        ctx.task_span = start_span(
+            name=_task_span_name(message, task_id),
+            span_attributes={"type": SpanTypeAttribute.TASK},
+            metadata=_task_metadata(message),
+            parent=agent_span_export or self._root_span_export,
+        )
+        ctx.task_confirmed = True
+        self._task_order.append(tool_use_id_str)
+    else:
+        # TaskProgressMessage — update existing task span.
+        update: dict[str, Any] = {}
+        metadata = _task_metadata(message)
+        if metadata:
+            update["metadata"] = metadata
+        output = _task_output(message)
+        if output is not None:
+            update["output"] = output
+        if update:
+            ctx.task_span.log(**update)
+
+    if message_type == MessageClassName.TASK_NOTIFICATION:
+        ctx.task_span.end()
+        ctx.task_span = None
+        self._task_order = [k for k in self._task_order if k != tool_use_id_str]
+```
+
+### `ToolSpanTracker` — new methods
+
+These are added alongside the existing `cleanup()`, which stays untouched until
+Step 5 deletes it.
+
+#### `cleanup_context`
+
+Closes tool spans belonging to one subagent context. Called by
+`ContextTracker._handle_assistant` before starting a new LLM span for that
+context. Skips any span whose `tool_use_id` is in `exclude_ids` (live Agent
+spans).
+
+Replaces the mid-stream `cleanup(end_time=..., exclude_tool_use_ids=...,
+only_parent_tool_use_id=...)` call.
+
+```python
+def cleanup_context(
+    self,
+    parent_tool_use_id: str | None,
+    *,
+    end_time: float | None = None,
+    exclude_ids: frozenset[str] = frozenset(),
+) -> None:
+    for tool_use_id in list(self._active_spans):
+        if tool_use_id in exclude_ids:
+            continue
+        if self._active_spans[tool_use_id].parent_tool_use_id != parent_tool_use_id:
+            continue
+        self._end_tool_span(tool_use_id, end_time=end_time)
+```
+
+#### `cleanup_all`
+
+Closes all remaining active spans. Called at end-of-stream by
+`ContextTracker.cleanup()`.
+
+Replaces the no-args `cleanup()` call in `finally:`.
+
+```python
+def cleanup_all(self, end_time: float | None = None) -> None:
+    for tool_use_id in list(self._active_spans):
+        self._end_tool_span(tool_use_id, end_time=end_time)
+```
+
+### Module-level helpers (extracted from `TaskEventSpanTracker`)
+
+```python
+def _task_span_name(message: Any, task_id: str) -> str:
+    return (getattr(message, "description", None)
+            or getattr(message, "task_type", None)
+            or f"Task {task_id}")
+
+def _task_metadata(message: Any) -> dict[str, Any]:
+    return {k: v for k, v in {
+        "task_id":       getattr(message, "task_id", None),
+        "session_id":    getattr(message, "session_id", None),
+        "tool_use_id":   getattr(message, "tool_use_id", None),
+        "task_type":     getattr(message, "task_type", None),
+        "status":        getattr(message, "status", None),
+        "last_tool_name":getattr(message, "last_tool_name", None),
+        "usage":         getattr(message, "usage", None),
+    }.items() if v is not None}
+
+def _task_output(message: Any) -> dict[str, Any] | None:
+    summary = getattr(message, "summary", None)
+    output_file = getattr(message, "output_file", None)
+    if summary is None and output_file is None:
+        return None
+    return {k: v for k, v in {"summary": summary, "output_file": output_file}.items()
+            if v is not None}
+```
+
+### `receive_response` (final form)
+
+```python
+async def receive_response(self) -> AsyncGenerator[Any, None]:
+    generator = self.__client.receive_response()
+    with start_span(
+        name=CLAUDE_AGENT_TASK_SPAN_NAME,
+        span_attributes={"type": SpanTypeAttribute.TASK},
+        input=self.__last_prompt or None,
+    ) as span:
+        input_needs_update = self.__captured_messages is not None
+        tracker = ContextTracker(span, self.__last_prompt, self.__query_start_time)
+        try:
+            async for message in generator:
+                if input_needs_update:
+                    captured = self.__captured_messages or []
+                    if captured:
+                        span.log(input=captured)
+                    input_needs_update = False
+                tracker.add(message)
+                yield message
+        except asyncio.CancelledError:
+            tracker.log_output()
+        else:
+            tracker.log_output()
+        finally:
+            tracker.log_tasks()
+            tracker.cleanup()
+```
+
+### Span parentage
+
+| Span type | Parent |
+|---|---|
+| Root TASK (`"Claude Agent"`) | Ambient caller context |
+| Subagent TASK | Agent tool span → fallback: root TASK |
+| LLM (orchestrator) | Root TASK, or latest active subagent TASK (`_task_order` fallback) |
+| LLM (subagent) | That subagent's TASK span |
+| TOOL | LLM span of the `AssistantMessage` containing the tool call |
+| Nested user span in tool handler | TOOL span (via `set_current()`) |
+
+---
+
+## Implementation Order
+
+Each step ends with a green `nox -s "test_claude_agent_sdk(latest)"` run.
+
+### Step 0 ✅ — Remove `_wrap_tool_factory`
+
+Done. Deleted the redundant `tool()` patch from `_wrapper.py` and `__init__.py`.
+
+### Step 1 ✅ — Extract task-event helpers to module-level functions
+
+Done. Added `_task_span_name()`, `_task_metadata()`, `_task_output()` as
+module-level functions. `TaskEventSpanTracker._span_name` / `._metadata` /
+`._output` now delegate to them.
+
+### Step 2 ✅ — Add `cleanup_context` / `cleanup_all` to `ToolSpanTracker`
+
+Done. Added both methods. Existing `cleanup()` left untouched.
+
+### Step 3 — Migrate mid-stream cleanup call in `receive_response`
+
+Replace the mid-stream `tool_tracker.cleanup(end_time=..., exclude_...,
+only_...)` call with `tool_tracker.cleanup_context(...)`. Simplify old
+`cleanup()` to delegate to `cleanup_all(end_time)`. Remove `_UNSET_PARENT` from
+`cleanup()`'s signature (the sentinel itself stays — `LLMSpanTracker` still
+uses it).
+
+**Dependencies:** Step 2.
+
+### Step 4 — Add `_AgentContext` and `ContextTracker`
+
+Implement the full `ContextTracker` class (dead code — not wired in yet).
+
+**Dependencies:** Steps 1 + 2.
+
+### Step 5 — Wire `ContextTracker` into `receive_response`; delete old classes
+
+- Rewrite `receive_response` to use `ContextTracker`.
+- Delete `LLMSpanTracker`, `TaskEventSpanTracker`, `_UNSET_PARENT`.
+- From `ToolSpanTracker`: remove `_pending_task_link_tool_use_ids` field +
+  property, `mark_task_started()`, the discard in `_end_tool_span` and
+  `start_tool_spans`, and old `cleanup()`.
+
+**Dependencies:** Steps 3 + 4.
+
+### Dependency graph
+
+```
+Step 0 (done)
+  │
+  ├─► Step 1 (extract helpers)       ─┐
+  │                                    ├─► Step 4 (ContextTracker) ─► Step 5 (wire + delete)
+  ├─► Step 2 (add cleanup methods)  ──┤
+  │                                    │
+  └─► Step 3 (migrate cleanup call) ──┘
+           ↑ depends on Step 2
+```
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/SIMPLIFICATION.md b/py/src/braintrust/wrappers/claude_agent_sdk/SIMPLIFICATION.md
new file mode 100644
index 00000000..d0b4d139
--- /dev/null
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/SIMPLIFICATION.md
@@ -0,0 +1,366 @@
+# Simplification Analysis: Claude Agent SDK Instrumentation
+
+This document analyses the current three-tracker architecture and proposes concrete
+simplifications that reduce the number of trackers, eliminate redundant state, and
+make context routing explicit.
+
+---
+
+## 0. The Wrapper Layer
+
+The monkeypatch installs three wrappers, but they serve two completely different jobs:
+
+| Wrapper | Job |
+|---------|-----|
+| `WrappedClaudeSDKClient` | Stream processing — observes every SDK message, creates TASK/LLM/TOOL spans, drives all three trackers |
+| `WrappedSdkMcpTool` / `wrapped_tool_fn` | Handler activation — wraps tool handlers at registration time so they re-enter the pre-created TOOL span when the SDK calls them |
+
+The handler wrappers (`SdkMcpTool` and `tool`) are a bridge between two execution
+contexts: span *creation* happens on the stream side (controlled by Braintrust) and
+span *activation* happens on the handler side (called by the Claude SDK). See
+`INSTRUMENTATION.md § 1b` for the full two-phase handoff diagram.
+
+### 0a. `wrapped_tool_fn` is redundant and can be removed
+
+`claude_agent_sdk.tool()` is not an independent code path. Its entire body is:
+
+```python
+def decorator(handler) -> SdkMcpTool[Any]:
+    return SdkMcpTool(name=name, description=description, input_schema=input_schema, handler=handler, ...)
+return decorator
+```
+
+The `SdkMcpTool` name inside that function is resolved through `tool.__globals__`,
+which is `claude_agent_sdk.__dict__`. Patching `claude_agent_sdk.SdkMcpTool =
+WrappedSdkMcpTool` is therefore sufficient — every `tool()` call already routes
+through `WrappedSdkMcpTool.__init__`, which wraps the handler via
+`_wrap_tool_handler`. No separate `tool` patch is needed.
+
+This holds even for the `from claude_agent_sdk import tool` pre-import case that the
+`sys.modules` sweep was designed to handle: because `tool.__globals__ is
+claude_agent_sdk.__dict__`, the function always looks up `SdkMcpTool` from the
+module it was *defined* in, not from the importing module.
+
+The one real obstacle is that `tool()`'s inner `decorator` function has a
+`-> SdkMcpTool[Any]` return annotation that Python evaluates eagerly. This calls
+`__class_getitem__` on whatever `SdkMcpTool` currently is, which would raise
+`TypeError` on a plain subclass. The `__class_getitem__` override already present on
+`WrappedSdkMcpTool` handles this:
+
+```python
+__class_getitem__ = classmethod(lambda cls, params: cls)
+```
+
+**What can be removed:**
+
+| Location | What to remove |
+|----------|----------------|
+| `_wrapper.py` | `_wrap_tool_factory` function entirely |
+| `__init__.py` | `_wrap_tool_factory` import |
+| `__init__.py` | `original_tool_fn` / `wrapped_tool_fn` block and its `sys.modules` sweep |
+
+`WrappedSdkMcpTool` and its `__class_getitem__` override stay exactly as-is.
+
+The rest of this document focuses on the three tracker objects that live inside
+`receive_response()`.
+
+---
+
+## 1. Current Architecture: Three Trackers, Many Interactions
+
+The current implementation uses three distinct tracker objects that collaborate via
+method calls and shared references:
+
+```
+receive_response()
+  │
+  ├── LLMSpanTracker        — per-subagent-context LLM span lifecycle
+  ├── ToolSpanTracker       — live tool spans, dispatch queues, pending-task IDs
+  └── TaskEventSpanTracker  — TASK spans for subagents, needs a ref to ToolSpanTracker
+```
+
+They interact with each other in non-obvious ways:
+
+| Caller | Callee | Why |
+|--------|--------|-----|
+| `TaskEventSpanTracker.__init__` | receives `ToolSpanTracker` | needs `get_span_export()` to set task span parent |
+| `TaskEventSpanTracker.process` | `tool_tracker.mark_task_started()` | removes tool_use_id from `_pending_task_link_tool_use_ids` |
+| `receive_response` | `task_event_span_tracker.active_tool_use_ids` + `tool_tracker.pending_task_link_tool_use_ids` | builds combined exclusion set for cleanup |
+| `receive_response` | `task_event_span_tracker.parent_export_for_message()` | gets LLM span parent before calling `llm_tracker.start_llm_span()` |
+| `receive_response` | `llm_tracker.current_span_export` → passed to `tool_tracker.start_tool_spans()` | chains LLM export to tool parent |
+
+Five cross-tracker interactions in a hot loop. Every time a new subagent feature needs
+a change, the developer has to reason about all three trackers simultaneously.
+
+---
+
+## 2. Redundant and Duplicated State
+
+### 2a. Two half-pictures of the same "Agent tool call" lifecycle
+
+`ToolSpanTracker._pending_task_link_tool_use_ids` and
+`TaskEventSpanTracker._task_span_by_tool_use_id` together track the full lifecycle
+of an `Agent` tool call:
+
+```
+State              Stored in                     Description
+──────             ─────────                     ───────────
+Pending            ToolSpanTracker               Agent span created, TaskStarted not yet seen
+Linked             TaskEventSpanTracker           TaskStarted arrived, task_span_by_tool_use_id set
+Ended              (both remove the entry)        TaskNotification arrived
+```
+
+These two dictionaries key on `agent_tool_use_id` and always move in lockstep:
+`pending → linked` happens atomically in `process()` via `mark_task_started()`.
+The consumer in `receive_response` always reads *both*:
+
+```python
+active_subagent_tool_use_ids = (
+    task_event_span_tracker.active_tool_use_ids          # linked
+    | tool_tracker.pending_task_link_tool_use_ids         # pending
+)
+```
+
+This set union reconstructs information that was always a single set of "live agent
+tool calls". Splitting it between two trackers is unnecessary.
+
+### 2b. `LLMSpanTracker` and `TaskEventSpanTracker` share the same routing key
+
+Both trackers key their primary state on `parent_tool_use_id` (the agent tool call
+that spawned a subagent). The connection is direct:
+
+- `LLMSpanTracker._states[parent_tool_use_id]` → a subagent's LLM span state
+- `TaskEventSpanTracker._task_span_by_tool_use_id[parent_tool_use_id]` → a subagent's TASK span
+
+A subagent has exactly one TASK span and a sequence of LLM spans, all keyed by the
+same `parent_tool_use_id`. Keeping them in two different tracker objects means every
+subagent-related operation must touch two places.
+
+### 2c. `_active_context` is an implicit, mutable cursor
+
+`LLMSpanTracker._active_context` is set via `set_context()` before any method that
+should route to a specific subagent. The sentinel `_UNSET_PARENT = object()` then
+distinguishes "use active context" from "use orchestrator (None)".
+
+This makes it easy to introduce bugs where `set_context()` is forgotten or called
+out of order. The `mark_next_llm_start` method has an entire special-case block to
+compensate for `UserMessage`s that arrive with `parent_tool_use_id=None` while the
+active context is set to a subagent:
+
+```python
+def mark_next_llm_start(self, parent_tool_use_id=_UNSET_PARENT):
+    if parent_tool_use_id is None and self._active_context is not None:
+        parent_tool_use_id = _UNSET_PARENT   # fall back to active context
+    self._get_state(parent_tool_use_id).next_start_time = time.time()
+```
+
+This implicit fallback would be unnecessary if context routing were always explicit.
+
+### 2d. `cleanup()` has three orthogonal filter modes in one method
+
+```python
+def cleanup(
+    self,
+    end_time: float | None = None,
+    exclude_tool_use_ids: frozenset[str] | None = None,
+    only_parent_tool_use_id: Any = _UNSET_PARENT,   # sentinel again
+) -> None:
+```
+
+Three call sites, each using a different combination of parameters. This is a sign
+the method is doing three different jobs:
+
+1. **End-of-stream**: called with no filters — close everything.
+2. **Pre-LLM cleanup within a context**: called with `only_parent_tool_use_id` + `exclude_tool_use_ids` — close dangling tool spans scoped to one subagent, but skip live Agent spans.
+3. **Dangling-span cleanup**: called from tests with just `end_time` or no args.
+
+A simpler API would expose these three intents as distinct methods or with clearer
+parameter names that do not require a sentinel object.
+
+---
+
+## 3. What Is Genuinely Irreducible
+
+Not all complexity can be removed. The following pieces are load-bearing:
+
+### 3a. Per-subagent-context state
+
+Concurrent subagents interleave on a single message stream. Each subagent needs its
+own LLM span sequence and TASK span. Keying state on `parent_tool_use_id` (or `None`
+for the orchestrator) is the correct abstraction.
+
+### 3b. Dispatch queues in `ToolSpanTracker`
+
+When two subagents call the same tool with identical arguments, the handler receives
+only `(tool_name, args)` — not a `tool_use_id`. The FIFO dispatch queue maps the
+handler invocation order to the span creation order, which matches the Claude SDK's
+own execution order. This is necessary and correct.
+
+### 3c. Thread-local for handler-to-span bridging
+
+Tool handlers are called by the Claude SDK without any Braintrust context. A
+thread-local is the only way to bridge the active stream session to the handler.
+This cannot be removed without changing the SDK's calling convention.
+
+### 3d. `next_start_time` for non-overlapping sequential spans
+
+Stamping the time when a `UserMessage` with tool results arrives, then using that
+stamp as both the end time of the previous LLM span and the start time of the next
+one, is necessary to produce accurate, non-overlapping span timelines. This logic
+must live somewhere.
+
+---
+
+## 4. Proposed Simplifications
+
+### 4a. Merge `LLMSpanTracker` and `TaskEventSpanTracker` into `ContextTracker`
+
+Since both trackers key on `parent_tool_use_id`, merge them into a single object
+with one state record per subagent context:
+
+```python
+@dataclasses.dataclass
+class _AgentContext:
+    # LLM state (from LLMSpanTracker._SubagentState)
+    llm_span: Any | None = None
+    llm_span_export: str | None = None
+    llm_parent_export: str | None = None
+    llm_output: list | None = None
+    next_llm_start: float | None = None
+    # Task state (from TaskEventSpanTracker._task_span_by_tool_use_id)
+    task_span: Any | None = None
+    task_id: str | None = None
+
+class ContextTracker:
+    def __init__(self, root_span_export: str, query_start_time: float | None = None):
+        self._root_span_export = root_span_export
+        # parent_tool_use_id (or None for orchestrator) → _AgentContext
+        self._contexts: dict[str | None, _AgentContext] = {
+            None: _AgentContext(next_llm_start=query_start_time)
+        }
+        self._active_key: str | None = None  # still needed as a cursor, see 4b
+        self._task_order: list[str] = []     # for fallback parent resolution
+
+    def set_active(self, parent_tool_use_id: str | None) -> None: ...
+    def start_llm_span(self, message, prompt, history, parent_export) -> ...: ...
+    def mark_next_llm_start(self, parent_tool_use_id: str | None) -> None: ...
+    def process_task_event(self, message) -> None: ...  # replaces TaskEventSpanTracker.process
+    def llm_parent_export_for_message(self, message) -> str: ...
+    def log_usage(self, metrics) -> None: ...
+    def cleanup(self) -> None: ...
+```
+
+**What this removes:**
+- `TaskEventSpanTracker` as a separate class (≈ 100 lines of code).
+- The `ToolSpanTracker` constructor argument `tool_tracker` from `TaskEventSpanTracker`.
+- The `_task_span_by_tool_use_id` dict — it becomes `_contexts[tool_use_id].task_span`.
+- The `_active_task_order` list can stay on `ContextTracker` as `_task_order` for
+  the same fallback-parent purpose.
+
+**The two remaining `ToolSpanTracker` cross-calls** become:
+- `mark_task_started(tool_use_id)` → `ContextTracker.process_task_event` already knows
+  this; `ToolSpanTracker` can expose a simple `unlink_agent_span(tool_use_id)` or the
+  pending-ID set can move into `ContextTracker` entirely (see 4b).
+- `get_span_export(tool_use_id)` → `ContextTracker._contexts[tool_use_id].task_span.export()`
+
+### 4b. Move the "pending Agent spans" set into `ContextTracker`
+
+`ToolSpanTracker._pending_task_link_tool_use_ids` exists solely to tell `cleanup()`
+"don't close this Agent tool span, its TaskStarted hasn't arrived yet". The decision
+of whether an Agent span is pending or linked is owned by the task event lifecycle,
+which will live in `ContextTracker` after 4a. So the set belongs there.
+
+`ContextTracker` would track whether a context has been confirmed by `TaskStarted`
+as a boolean flag on `_AgentContext`:
+
+```python
+@dataclasses.dataclass
+class _AgentContext:
+    ...
+    task_confirmed: bool = False  # True after TaskStarted received
+```
+
+`ToolSpanTracker.cleanup()` would receive the full set of "live agent tool_use_ids"
+(both confirmed and unconfirmed) from `ContextTracker.live_agent_tool_use_ids` —
+a single property, not two properties unioned by the caller.
+
+### 4c. Make context routing explicit, remove the `_UNSET_PARENT` sentinel
+
+The `_UNSET_PARENT = object()` sentinel is a code smell — it is a non-serializable
+runtime object used as a dict key guard. The need for it arises because
+`mark_next_llm_start` has an implicit fallback: "if you passed `None` but there's
+an active subagent, use the active subagent instead."
+
+Replace the implicit fallback with explicit routing at the call site in
+`receive_response`, where the `UserMessage`'s `parent_tool_use_id` is already being
+read:
+
+```python
+# Before (implicit fallback inside LLMSpanTracker):
+llm_tracker.mark_next_llm_start(user_parent)
+
+# After (caller resolves the context before calling):
+resolved_context = user_parent if user_parent is not None else self._active_context
+context_tracker.mark_next_llm_start(resolved_context)
+```
+
+With this change, `_UNSET_PARENT` can be deleted along with the fallback branch
+inside `mark_next_llm_start`. The tracker method signature becomes simply
+`mark_next_llm_start(context_key: str | None)`.
+
+### 4d. Simplify `ToolSpanTracker.cleanup()` into two focused methods
+
+Replace the three-mode method with two explicit ones:
+
+```python
+def cleanup_context(self, parent_tool_use_id: str | None, *, end_time: float | None = None, exclude_ids: frozenset[str] = frozenset()) -> None:
+    """Close all active tool spans belonging to a specific subagent context,
+    optionally skipping Agent spans that are still live."""
+
+def cleanup_all(self, end_time: float | None = None) -> None:
+    """Close all remaining active spans. Called at end-of-stream."""
+```
+
+The three call sites in `receive_response` and tests map cleanly:
+- Pre-LLM cleanup → `cleanup_context(incoming_parent, end_time=..., exclude_ids=live_agent_ids)`
+- End-of-stream → `cleanup_all()`
+- Test helpers → `cleanup_all()` or `cleanup_context(...)`
+
+No sentinel needed; the filter intent is expressed in the method name.
+
+---
+
+## 5. Summary of Changes
+
+| Change | Effect |
+|--------|--------|
+| Merge `LLMSpanTracker` + `TaskEventSpanTracker` → `ContextTracker` | −1 tracker class, eliminates constructor coupling, unifies per-subagent state |
+| Move `_pending_task_link_tool_use_ids` into `ContextTracker` | Eliminates two-property union at call site, single source of truth for Agent span liveness |
+| Remove `_UNSET_PARENT` sentinel | Eliminates implicit fallback, makes `receive_response` loop more readable |
+| Split `cleanup()` into `cleanup_context()` + `cleanup_all()` | Clarifies intent at each call site, removes three-mode parameter combination |
+
+**Trackers before:** 3 (`ToolSpanTracker`, `LLMSpanTracker`, `TaskEventSpanTracker`)
+**Trackers after:** 2 (`ToolSpanTracker`, `ContextTracker`)
+
+**Cross-tracker interactions before:** 5 (see §1 table)
+**Cross-tracker interactions after:** 2 (ContextTracker gives ToolSpanTracker the live-agent-id set for cleanup; ToolSpanTracker gives ContextTracker a task span parent export via `get_span_export`)
+
+---
+
+## 6. What Does Not Change
+
+- **`WrappedSdkMcpTool`** — the handler-side wrapper is a separate concern (span
+  activation, not span creation) and is entirely unaffected. See
+  `INSTRUMENTATION.md § 1b`. `wrapped_tool_fn` is removed as part of § 0a above.
+- The `_dispatch_queues` FIFO mechanism in `ToolSpanTracker` — still required.
+- The thread-local for handler bridging — still required. The handler wrappers read
+  it to find the active `ToolSpanTracker`; after this refactor they would read it to
+  find the active `ToolSpanTracker` inside `ContextTracker` (or a direct reference
+  to the same object — the public API is unchanged).
+- The `next_llm_start` stamping logic — still required, just moves into `_AgentContext`.
+- The `_active_context` / `set_active()` cursor on `ContextTracker` — still needed
+  because `AssistantMessage` arrives with a `parent_tool_use_id` that sets routing
+  for the rest of that message's processing. The cursor avoids threading it through
+  every call signature inside the message loop.
+- The test surface — all existing unit and integration tests remain valid; only
+  the internal class and method names change.
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index 3521be02..47f0168a 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -308,20 +308,8 @@ def finish_tool_spans(self, message: Any) -> None:
 
             self._end_tool_span(str(tool_use_id), tool_result_block=block)
 
-    def cleanup(
-        self,
-        end_time: float | None = None,
-        exclude_tool_use_ids: frozenset[str] | None = None,
-        only_parent_tool_use_id: Any = _UNSET_PARENT,
-    ) -> None:
-        for tool_use_id in list(self._active_spans):
-            if exclude_tool_use_ids and tool_use_id in exclude_tool_use_ids:
-                continue
-            if only_parent_tool_use_id is not _UNSET_PARENT:
-                active = self._active_spans.get(tool_use_id)
-                if active is not None and active.parent_tool_use_id != only_parent_tool_use_id:
-                    continue
-            self._end_tool_span(tool_use_id, end_time=end_time)
+    def cleanup(self, end_time: float | None = None) -> None:
+        self.cleanup_all(end_time=end_time)
 
     def cleanup_context(
         self,
@@ -851,10 +839,10 @@ async def receive_response(self) -> AsyncGenerator[Any, None]:
                                     task_event_span_tracker.active_tool_use_ids
                                     | tool_tracker.pending_task_link_tool_use_ids
                                 )
-                                tool_tracker.cleanup(
+                                tool_tracker.cleanup_context(
+                                    incoming_parent,
                                     end_time=llm_tracker.get_next_start_time(),
-                                    exclude_tool_use_ids=active_subagent_tool_use_ids,
-                                    only_parent_tool_use_id=incoming_parent,
+                                    exclude_ids=active_subagent_tool_use_ids,
                                 )
                             llm_parent_export = task_event_span_tracker.parent_export_for_message(
                                 message,
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
index 02386156..1ea93d84 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
@@ -2146,12 +2146,10 @@ async def test_concurrent_subagent_tool_output_not_silently_dropped(memory_logge
         assert tracker.has_active_spans, "Tool span should be active after start_tool_spans"
 
         # Cleanup triggered by beta's AssistantMessage — scoped to beta's context
-        tracker.cleanup(only_parent_tool_use_id="call-beta")
+        tracker.cleanup_context("call-beta")
 
         # Alpha's tool span should still be active
-        assert tracker.has_active_spans, (
-            "cleanup(only_parent_tool_use_id='call-beta') should not end alpha's tool span"
-        )
+        assert tracker.has_active_spans, "cleanup_context('call-beta') should not end alpha's tool span"
 
         # Alpha's ToolResultBlock arrives and should be recorded
         tracker.finish_tool_spans(
@@ -2199,7 +2197,7 @@ def test_tool_span_tracker_cleanup_preserves_cross_subagent_spans(memory_logger)
         )
 
         # Cleanup triggered by beta's AssistantMessage — scoped to beta
-        tracker.cleanup(only_parent_tool_use_id="call-beta")
+        tracker.cleanup_context("call-beta")
 
         # Alpha's span should still be active
         assert tracker.has_active_spans, "Alpha's tool span should survive beta-scoped cleanup"

From 2931fc1d3b2b1f0f8db34554610c59a7555d5c6c Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Thu, 19 Mar 2026 00:05:53 -0400
Subject: [PATCH 06/12] Add _AgentContext and ContextTracker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Step 4 of the Claude Agent SDK instrumentation simplification plan.

Add the _AgentContext dataclass and ContextTracker class that will
replace LLMSpanTracker + TaskEventSpanTracker. ContextTracker owns a
private ToolSpanTracker and provides a single add() method that
dispatches SDK messages to internal handlers.

This is dead code — not wired into receive_response yet. The next step
(Step 5) will do the switchover and delete the old tracker classes.
---
 .../wrappers/claude_agent_sdk/_wrapper.py     | 265 ++++++++++++++++++
 1 file changed, 265 insertions(+)

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index 47f0168a..1d1528e6 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -746,6 +746,271 @@ def _message_starts_subagent_tool(message: Any) -> bool:
     return False
 
 
+@dataclasses.dataclass
+class _AgentContext:
+    """Per-subagent-context state, keyed by parent_tool_use_id (None = orchestrator)."""
+
+    llm_span: Any | None = None
+    llm_output: list[dict[str, Any]] | None = None
+    next_llm_start: float | None = None
+    task_span: Any | None = None
+    task_confirmed: bool = False
+
+
+class ContextTracker:
+    """Single consumer of the raw SDK message stream.
+
+    Replaces LLMSpanTracker + TaskEventSpanTracker with unified per-subagent
+    context tracking.  Owns a private ToolSpanTracker instance.
+    """
+
+    def __init__(
+        self,
+        root_span: Any,
+        prompt: Any,
+        query_start_time: float | None = None,
+    ) -> None:
+        self._root_span = root_span
+        self._root_span_export = root_span.export()
+        self._prompt = prompt
+
+        self._tool_tracker = ToolSpanTracker()
+        self._contexts: dict[str | None, _AgentContext] = {None: _AgentContext(next_llm_start=query_start_time)}
+        self._active_key: str | None = None
+        self._task_order: list[str | None] = []
+
+        self._final_results: list[dict[str, Any]] = []
+        self._task_events: list[dict[str, Any]] = []
+
+        _thread_local.tool_span_tracker = self._tool_tracker
+
+    # -- public API --
+
+    def add(self, message: Any) -> None:
+        """Consume one SDK message and update spans accordingly."""
+        message_type = type(message).__name__
+        if message_type == MessageClassName.ASSISTANT:
+            self._handle_assistant(message)
+        elif message_type == MessageClassName.USER:
+            self._handle_user(message)
+        elif message_type == MessageClassName.RESULT:
+            self._handle_result(message)
+        elif message_type in SYSTEM_MESSAGE_TYPES:
+            self._handle_system(message)
+
+    def log_output(self) -> None:
+        """Log the last accumulated assistant message as the root span output."""
+        if self._final_results:
+            self._root_span.log(output=self._final_results[-1])
+
+    def log_tasks(self) -> None:
+        """Flush accumulated task events to the root span metadata."""
+        if self._task_events:
+            self._root_span.log(metadata={"task_events": self._task_events})
+
+    def cleanup(self) -> None:
+        """End all open LLM spans, TASK spans, and TOOL spans; clear thread-local."""
+        for ctx in self._contexts.values():
+            if ctx.llm_span:
+                ctx.llm_span.end()
+                ctx.llm_span = None
+            if ctx.task_span:
+                ctx.task_span.end()
+                ctx.task_span = None
+        self._task_order.clear()
+        self._tool_tracker.cleanup_all()
+        if hasattr(_thread_local, "tool_span_tracker"):
+            delattr(_thread_local, "tool_span_tracker")
+
+    # -- internal handlers --
+
+    def _handle_assistant(self, message: Any) -> None:
+        incoming_parent = getattr(message, "parent_tool_use_id", None)
+        self._active_key = incoming_parent
+        ctx = self._get_context(incoming_parent)
+
+        # Close dangling tool spans from the previous turn in this context.
+        if ctx.llm_span and self._tool_tracker.has_active_spans:
+            self._tool_tracker.cleanup_context(
+                incoming_parent,
+                end_time=ctx.next_llm_start or time.time(),
+                exclude_ids=self._live_agent_tool_use_ids(),
+            )
+
+        parent_export = self._llm_parent_for_message(message)
+        final_content, extended = self._start_or_merge_llm_span(message, parent_export, ctx)
+
+        llm_export = ctx.llm_span.export() if ctx.llm_span else None
+        self._tool_tracker.start_tool_spans(message, llm_export)
+
+        self._register_pending_agent_contexts(message)
+
+        if final_content:
+            if extended and self._final_results and self._final_results[-1].get("role") == "assistant":
+                self._final_results[-1] = final_content
+            else:
+                self._final_results.append(final_content)
+
+    def _handle_user(self, message: Any) -> None:
+        self._tool_tracker.finish_tool_spans(message)
+        has_tool_results = False
+        if hasattr(message, "content"):
+            has_tool_results = any(type(b).__name__ == BlockClassName.TOOL_RESULT for b in message.content)
+            content = _serialize_content_blocks(message.content)
+            self._final_results.append({"content": content, "role": "user"})
+        if has_tool_results:
+            user_parent = getattr(message, "parent_tool_use_id", None)
+            resolved_key = user_parent if user_parent is not None else self._active_key
+            self._get_context(resolved_key).next_llm_start = time.time()
+
+    def _handle_result(self, message: Any) -> None:
+        self._active_key = None
+        if hasattr(message, "usage"):
+            usage_metrics = _extract_usage_from_result_message(message)
+            ctx = self._get_context(None)
+            if ctx.llm_span and usage_metrics:
+                ctx.llm_span.log(metrics=usage_metrics)
+        result_metadata = {
+            k: v
+            for k, v in {
+                "num_turns": getattr(message, "num_turns", None),
+                "session_id": getattr(message, "session_id", None),
+            }.items()
+            if v is not None
+        }
+        if result_metadata:
+            self._root_span.log(metadata=result_metadata)
+
+    def _handle_system(self, message: Any) -> None:
+        agent_span_export = self._tool_tracker.get_span_export(getattr(message, "tool_use_id", None))
+        self._process_task_event(message, agent_span_export)
+        self._task_events.append(_serialize_system_message(message))
+
+    # -- internal helpers --
+
+    def _get_context(self, key: str | None) -> _AgentContext:
+        ctx = self._contexts.get(key)
+        if ctx is None:
+            ctx = _AgentContext()
+            self._contexts[key] = ctx
+        return ctx
+
+    def _register_pending_agent_contexts(self, message: Any) -> None:
+        """Pre-create _AgentContext for Agent tool calls (task_confirmed=False)."""
+        if not hasattr(message, "content"):
+            return
+        for block in message.content:
+            if type(block).__name__ == BlockClassName.TOOL_USE and getattr(block, "name", None) == "Agent":
+                tool_use_id = getattr(block, "id", None)
+                if tool_use_id:
+                    self._get_context(str(tool_use_id))
+
+    def _live_agent_tool_use_ids(self) -> frozenset[str]:
+        """Return tool_use_ids of Agent spans that must not be closed yet."""
+        result: set[str] = set()
+        for key, ctx in self._contexts.items():
+            if key is None:
+                continue
+            if not ctx.task_confirmed or ctx.task_span is not None:
+                result.add(key)
+        return frozenset(result)
+
+    def _llm_parent_for_message(self, message: Any) -> str:
+        """Determine the parent span export for an incoming AssistantMessage."""
+        parent_tool_use_id = getattr(message, "parent_tool_use_id", None)
+        if parent_tool_use_id is not None:
+            ctx = self._contexts.get(str(parent_tool_use_id))
+            if ctx is not None and ctx.task_span is not None:
+                return ctx.task_span.export()
+
+        if _message_starts_subagent_tool(message):
+            return self._root_span_export
+
+        for key in reversed(self._task_order):
+            ctx = self._contexts.get(key)
+            if ctx is not None and ctx.task_span is not None:
+                return ctx.task_span.export()
+
+        return self._root_span_export
+
+    def _start_or_merge_llm_span(
+        self,
+        message: Any,
+        parent_export: str | None,
+        ctx: _AgentContext,
+    ) -> tuple[dict[str, Any] | None, bool]:
+        """Start a new LLM span or extend the existing one via merge."""
+        current_message = _serialize_assistant_message(message)
+
+        # Merge path.
+        if ctx.llm_span and ctx.next_llm_start is None and current_message is not None:
+            merged = _merge_assistant_messages(
+                ctx.llm_output[0] if ctx.llm_output else None,
+                current_message,
+            )
+            if merged is not None:
+                ctx.llm_output = [merged]
+                ctx.llm_span.log(output=ctx.llm_output)
+            return merged, True
+
+        # New span path.
+        resolved_start = ctx.next_llm_start or time.time()
+        first_token_time = time.time()
+
+        if ctx.llm_span:
+            ctx.llm_span.end(end_time=resolved_start)
+
+        final_content, span = _create_llm_span_for_messages(
+            [message],
+            self._prompt,
+            self._final_results,
+            parent=parent_export,
+            start_time=resolved_start,
+        )
+        if span is not None:
+            span.log(metrics={"time_to_first_token": max(0.0, first_token_time - resolved_start)})
+        ctx.llm_span = span
+        ctx.llm_output = [final_content] if final_content is not None else None
+        ctx.next_llm_start = None
+        return final_content, False
+
+    def _process_task_event(self, message: Any, agent_span_export: str | None) -> None:
+        """Handle TaskStarted / TaskProgress / TaskNotification system messages."""
+        task_id = getattr(message, "task_id", None)
+        if task_id is None:
+            return
+        task_id = str(task_id)
+        tool_use_id = getattr(message, "tool_use_id", None)
+        tool_use_id_str = str(tool_use_id) if tool_use_id is not None else None
+        ctx = self._get_context(tool_use_id_str)
+        message_type = type(message).__name__
+
+        if ctx.task_span is None:
+            ctx.task_span = start_span(
+                name=_task_span_name(message, task_id),
+                span_attributes={"type": SpanTypeAttribute.TASK},
+                metadata=_task_metadata(message),
+                parent=agent_span_export or self._root_span_export,
+            )
+            ctx.task_confirmed = True
+            self._task_order.append(tool_use_id_str)
+        else:
+            update: dict[str, Any] = {}
+            metadata = _task_metadata(message)
+            if metadata:
+                update["metadata"] = metadata
+            output = _task_output(message)
+            if output is not None:
+                update["output"] = output
+            if update:
+                ctx.task_span.log(**update)
+
+        if message_type == MessageClassName.TASK_NOTIFICATION:
+            ctx.task_span.end()
+            ctx.task_span = None
+            self._task_order = [k for k in self._task_order if k != tool_use_id_str]
+
+
 def _create_client_wrapper_class(original_client_class: Any) -> Any:
     """Creates a wrapper class for ClaudeSDKClient that wraps query and receive_response."""
 

From ffd9f9899c1132cf4558bfae03989841643a2b38 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Thu, 19 Mar 2026 00:11:37 -0400
Subject: [PATCH 07/12] Wire ContextTracker into receive_response; delete old
 classes

Step 5 of the Claude Agent SDK instrumentation simplification plan.

- Rewrite receive_response to use ContextTracker.add() in the loop,
  replacing ~80 lines of three-tracker dispatch logic with a thin
  iterator wrapper.
- Delete LLMSpanTracker class entirely.
- Delete TaskEventSpanTracker class entirely (module-level helpers
  _task_span_name, _task_metadata, _task_output remain).
- Delete _UNSET_PARENT sentinel.
- From ToolSpanTracker: remove _pending_task_link_tool_use_ids field +
  property, mark_task_started(), the discard in _end_tool_span and
  start_tool_spans, and the old cleanup() shim.
- Retain llm_parent_export on _AgentContext to guard the merge path
  against parent changes (needed when subagent AssistantMessages arrive
  with parent_tool_use_id=None after an orchestrator message).
- Update tests: tracker.cleanup() -> tracker.cleanup_all().
---
 .../wrappers/claude_agent_sdk/_wrapper.py     | 369 ++----------------
 .../wrappers/claude_agent_sdk/test_wrapper.py |  12 +-
 2 files changed, 28 insertions(+), 353 deletions(-)

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index 1d1528e6..c37745d3 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -39,10 +39,6 @@ class ParsedToolName:
     mcp_server: str | None = None
 
 
-_UNSET_PARENT = object()
-"""Sentinel to distinguish 'no filter' from 'filter to orchestrator (None)'."""
-
-
 @dataclasses.dataclass
 class _ActiveToolSpan:
     span: Any
@@ -236,7 +232,6 @@ def _make_dispatch_key(tool_name: str, tool_input: Any) -> tuple[str, str]:
 class ToolSpanTracker:
     def __init__(self):
         self._active_spans: dict[str, _ActiveToolSpan] = {}
-        self._pending_task_link_tool_use_ids: set[str] = set()
         # Per-(tool_name, input_signature) FIFO queue of tool_use_ids.
         # Used by acquire_span_for_handler to disambiguate identical concurrent
         # tool calls (same name + same input) from sibling subagents.
@@ -291,8 +286,6 @@ def start_tool_spans(self, message: Any, llm_span_export: str | None) -> None:
             )
             dispatch_key = _make_dispatch_key(parsed_tool_name.raw_name, tool_input)
             self._dispatch_queues.setdefault(dispatch_key, collections.deque()).append(tool_use_id)
-            if parsed_tool_name.display_name == "Agent":
-                self._pending_task_link_tool_use_ids.add(tool_use_id)
 
     def finish_tool_spans(self, message: Any) -> None:
         if not hasattr(message, "content"):
@@ -308,9 +301,6 @@ def finish_tool_spans(self, message: Any) -> None:
 
             self._end_tool_span(str(tool_use_id), tool_result_block=block)
 
-    def cleanup(self, end_time: float | None = None) -> None:
-        self.cleanup_all(end_time=end_time)
-
     def cleanup_context(
         self,
         parent_tool_use_id: str | None,
@@ -339,16 +329,6 @@ def cleanup_all(self, end_time: float | None = None) -> None:
     def has_active_spans(self) -> bool:
         return bool(self._active_spans)
 
-    @property
-    def pending_task_link_tool_use_ids(self) -> frozenset[str]:
-        return frozenset(self._pending_task_link_tool_use_ids)
-
-    def mark_task_started(self, tool_use_id: Any) -> None:
-        if tool_use_id is None:
-            return
-
-        self._pending_task_link_tool_use_ids.discard(str(tool_use_id))
-
     def acquire_span_for_handler(self, tool_name: Any, args: Any) -> _ActiveToolSpan | None:
         parsed_tool_name = _parse_tool_name(tool_name)
         candidate_names = list(
@@ -397,7 +377,6 @@ def _end_tool_span(
         self, tool_use_id: str, tool_result_block: Any | None = None, end_time: float | None = None
     ) -> None:
         active_tool_span = self._active_spans.pop(tool_use_id, None)
-        self._pending_task_link_tool_use_ids.discard(tool_use_id)
         if active_tool_span is None:
             return
 
@@ -460,138 +439,6 @@ def _activate_tool_span_for_handler(tool_name: Any, args: Any) -> _ActiveToolSpa
     return tool_span_tracker.acquire_span_for_handler(tool_name, args) or _NOOP_ACTIVE_TOOL_SPAN
 
 
-class LLMSpanTracker:
-    """Manages LLM span lifecycle for Claude Agent SDK message streams.
-
-    Message flow per turn:
-    1. UserMessage (tool results) -> mark the time when next LLM will start
-    2. AssistantMessage - LLM response arrives -> create span with the marked start time, ending previous span
-    3. ResultMessage - usage metrics -> log to span
-
-    We end the previous span when the next AssistantMessage arrives, using the marked
-    start time to ensure sequential spans (no overlapping LLM spans).
-
-    Each subagent context (identified by parent_tool_use_id) gets its own independent
-    span state so concurrent subagents don't truncate each other's LLM spans.
-    """
-
-    @dataclasses.dataclass
-    class _SubagentState:
-        current_span: Any | None = None
-        current_span_export: str | None = None
-        current_parent_export: str | None = None
-        current_output: list[dict[str, Any]] | None = None
-        next_start_time: float | None = None
-
-    def __init__(self, query_start_time: float | None = None):
-        self._states: dict[str | None, LLMSpanTracker._SubagentState] = {}
-        self._active_context: str | None = None
-        # Seed the orchestrator context (parent_tool_use_id=None) with the
-        # query start time so the first orchestrator LLM span gets the right start.
-        self._states[None] = self._SubagentState(next_start_time=query_start_time)
-
-    def _get_state(self, parent_tool_use_id: str | None = _UNSET_PARENT) -> "_SubagentState":
-        key = self._active_context if parent_tool_use_id is _UNSET_PARENT else parent_tool_use_id
-        state = self._states.get(key)
-        if state is None:
-            state = self._SubagentState()
-            self._states[key] = state
-        return state
-
-    @property
-    def current_span(self) -> Any | None:
-        return self._get_state().current_span
-
-    @property
-    def current_span_export(self) -> str | None:
-        return self._get_state().current_span_export
-
-    def set_context(self, parent_tool_use_id: str | None) -> None:
-        """Set which subagent context subsequent calls operate on."""
-        self._active_context = parent_tool_use_id
-
-    def get_next_start_time(self) -> float:
-        state = self._get_state()
-        return state.next_start_time if state.next_start_time is not None else time.time()
-
-    def start_llm_span(
-        self,
-        message: Any,
-        prompt: Any,
-        conversation_history: list[dict[str, Any]],
-        parent_export: str | None = None,
-        start_time: float | None = None,
-    ) -> tuple[dict[str, Any] | None, bool]:
-        """Start a new LLM span, ending the previous one *in the same context*."""
-        state = self._get_state()
-        current_message = _serialize_assistant_message(message)
-
-        if (
-            state.current_span
-            and state.next_start_time is None
-            and state.current_parent_export == parent_export
-            and current_message is not None
-        ):
-            merged_message = _merge_assistant_messages(
-                state.current_output[0] if state.current_output else None,
-                current_message,
-            )
-            if merged_message is not None:
-                state.current_output = [merged_message]
-                state.current_span.log(output=state.current_output)
-            return merged_message, True
-
-        resolved_start_time = start_time if start_time is not None else self.get_next_start_time()
-        first_token_time = time.time()
-
-        if state.current_span:
-            state.current_span.end(end_time=resolved_start_time)
-
-        final_content, span = _create_llm_span_for_messages(
-            [message],
-            prompt,
-            conversation_history,
-            parent=parent_export,
-            start_time=resolved_start_time,
-        )
-        if span is not None:
-            span.log(metrics={"time_to_first_token": max(0.0, first_token_time - resolved_start_time)})
-        state.current_span = span
-        state.current_span_export = span.export() if span else None
-        state.current_parent_export = parent_export
-        state.current_output = [final_content] if final_content is not None else None
-        state.next_start_time = None
-        return final_content, False
-
-    def mark_next_llm_start(self, parent_tool_use_id: Any = _UNSET_PARENT) -> None:
-        """Mark when the next LLM call will start (after tool results).
-
-        When ``parent_tool_use_id`` is ``None`` (i.e. the message lacks the
-        attribute) but we have an active subagent context, fall back to the
-        active context so the timestamp lands on the correct subagent state
-        rather than the orchestrator state.
-        """
-        if parent_tool_use_id is None and self._active_context is not None:
-            parent_tool_use_id = _UNSET_PARENT
-        self._get_state(parent_tool_use_id).next_start_time = time.time()
-
-    def log_usage(self, usage_metrics: dict[str, float]) -> None:
-        """Log usage metrics to the current LLM span."""
-        state = self._get_state()
-        if state.current_span and usage_metrics:
-            state.current_span.log(metrics=usage_metrics)
-
-    def cleanup(self) -> None:
-        """End any unclosed spans across all subagent contexts."""
-        for state in self._states.values():
-            if state.current_span:
-                state.current_span.end()
-                state.current_span = None
-                state.current_span_export = None
-                state.current_parent_export = None
-                state.current_output = None
-
-
 def _task_span_name(message: Any, task_id: str) -> str:
     return getattr(message, "description", None) or getattr(message, "task_type", None) or f"Task {task_id}"
 
@@ -629,110 +476,6 @@ def _task_output(message: Any) -> dict[str, Any] | None:
     }
 
 
-class TaskEventSpanTracker:
-    def __init__(self, root_span_export: str, tool_tracker: ToolSpanTracker):
-        self._root_span_export = root_span_export
-        self._tool_tracker = tool_tracker
-        self._active_spans: dict[str, Any] = {}
-        self._task_span_by_tool_use_id: dict[str, Any] = {}
-        self._active_task_order: list[str] = []
-
-    def process(self, message: Any) -> None:
-        task_id = getattr(message, "task_id", None)
-        if task_id is None:
-            return
-
-        task_id = str(task_id)
-        message_type = type(message).__name__
-        task_span = self._active_spans.get(task_id)
-
-        if task_span is None:
-            task_span = start_span(
-                name=self._span_name(message, task_id),
-                span_attributes={"type": SpanTypeAttribute.TASK},
-                metadata=self._metadata(message),
-                parent=self._parent_export(message),
-            )
-            self._active_spans[task_id] = task_span
-            self._active_task_order.append(task_id)
-            tool_use_id = getattr(message, "tool_use_id", None)
-            if tool_use_id is not None:
-                tool_use_id = str(tool_use_id)
-                self._task_span_by_tool_use_id[tool_use_id] = task_span
-                self._tool_tracker.mark_task_started(tool_use_id)
-        else:
-            update: dict[str, Any] = {}
-            metadata = self._metadata(message)
-            if metadata:
-                update["metadata"] = metadata
-
-            output = self._output(message)
-            if output is not None:
-                update["output"] = output
-
-            if update:
-                task_span.log(**update)
-
-        if self._should_end(message_type):
-            tool_use_id = getattr(message, "tool_use_id", None)
-            if tool_use_id is not None:
-                self._task_span_by_tool_use_id.pop(str(tool_use_id), None)
-            task_span.end()
-            del self._active_spans[task_id]
-            self._active_task_order = [
-                active_task_id for active_task_id in self._active_task_order if active_task_id != task_id
-            ]
-
-    @property
-    def active_tool_use_ids(self) -> frozenset[str]:
-        return frozenset(self._task_span_by_tool_use_id.keys())
-
-    def cleanup(self) -> None:
-        for task_id, span in list(self._active_spans.items()):
-            span.end()
-            del self._active_spans[task_id]
-        self._task_span_by_tool_use_id.clear()
-        self._active_task_order.clear()
-
-    def parent_export_for_message(self, message: Any, fallback_export: str) -> str:
-        parent_tool_use_id = getattr(message, "parent_tool_use_id", None)
-        if parent_tool_use_id is None:
-            if _message_starts_subagent_tool(message):
-                return fallback_export
-            active_task_export = self._latest_active_task_export()
-            return active_task_export or fallback_export
-
-        task_span = self._task_span_by_tool_use_id.get(str(parent_tool_use_id))
-        if task_span is not None:
-            return task_span.export()
-
-        active_task_export = self._latest_active_task_export()
-        return active_task_export or fallback_export
-
-    def _latest_active_task_export(self) -> str | None:
-        for task_id in reversed(self._active_task_order):
-            task_span = self._active_spans.get(task_id)
-            if task_span is not None:
-                return task_span.export()
-
-        return None
-
-    def _parent_export(self, message: Any) -> str:
-        return self._tool_tracker.get_span_export(getattr(message, "tool_use_id", None)) or self._root_span_export
-
-    def _span_name(self, message: Any, task_id: str) -> str:
-        return _task_span_name(message, task_id)
-
-    def _metadata(self, message: Any) -> dict[str, Any]:
-        return _task_metadata(message)
-
-    def _output(self, message: Any) -> dict[str, Any] | None:
-        return _task_output(message)
-
-    def _should_end(self, message_type: str) -> bool:
-        return message_type == MessageClassName.TASK_NOTIFICATION
-
-
 def _message_starts_subagent_tool(message: Any) -> bool:
     if not hasattr(message, "content"):
         return False
@@ -751,6 +494,7 @@ class _AgentContext:
     """Per-subagent-context state, keyed by parent_tool_use_id (None = orchestrator)."""
 
     llm_span: Any | None = None
+    llm_parent_export: str | None = None
     llm_output: list[dict[str, Any]] | None = None
     next_llm_start: float | None = None
     task_span: Any | None = None
@@ -943,7 +687,12 @@ def _start_or_merge_llm_span(
         current_message = _serialize_assistant_message(message)
 
         # Merge path.
-        if ctx.llm_span and ctx.next_llm_start is None and current_message is not None:
+        if (
+            ctx.llm_span
+            and ctx.next_llm_start is None
+            and ctx.llm_parent_export == parent_export
+            and current_message is not None
+        ):
             merged = _merge_assistant_messages(
                 ctx.llm_output[0] if ctx.llm_output else None,
                 current_message,
@@ -970,6 +719,7 @@ def _start_or_merge_llm_span(
         if span is not None:
             span.log(metrics={"time_to_first_token": max(0.0, first_token_time - resolved_start)})
         ctx.llm_span = span
+        ctx.llm_parent_export = parent_export
         ctx.llm_output = [final_content] if final_content is not None else None
         ctx.next_llm_start = None
         return final_content, False
@@ -1075,91 +825,23 @@ async def receive_response(self) -> AsyncGenerator[Any, None]:
                 span_attributes={"type": SpanTypeAttribute.TASK},
                 input=initial_input,
             ) as span:
-                # If we're capturing async messages, we'll update input after they're consumed
                 input_needs_update = self.__captured_messages is not None
-
-                final_results: list[dict[str, Any]] = []
-                task_events: list[dict[str, Any]] = []
-                llm_tracker = LLMSpanTracker(query_start_time=self.__query_start_time)
-                tool_tracker = ToolSpanTracker()
-                task_event_span_tracker = TaskEventSpanTracker(span.export(), tool_tracker)
-                _thread_local.tool_span_tracker = tool_tracker
+                context_tracker = ContextTracker(
+                    root_span=span,
+                    prompt=self.__last_prompt,
+                    query_start_time=self.__query_start_time,
+                )
 
                 try:
                     async for message in generator:
-                        # Update input from captured async messages (once, after they're consumed)
+                        # One-shot: update root span input from async-generator prompt.
                         if input_needs_update:
-                            captured_input = self.__captured_messages if self.__captured_messages else []
-                            if captured_input:
-                                span.log(input=captured_input)
+                            captured = self.__captured_messages or []
+                            if captured:
+                                span.log(input=captured)
                             input_needs_update = False
 
-                        message_type = type(message).__name__
-
-                        if message_type == MessageClassName.ASSISTANT:
-                            incoming_parent = getattr(message, "parent_tool_use_id", None)
-                            llm_tracker.set_context(incoming_parent)
-                            if llm_tracker.current_span and tool_tracker.has_active_spans:
-                                active_subagent_tool_use_ids = (
-                                    task_event_span_tracker.active_tool_use_ids
-                                    | tool_tracker.pending_task_link_tool_use_ids
-                                )
-                                tool_tracker.cleanup_context(
-                                    incoming_parent,
-                                    end_time=llm_tracker.get_next_start_time(),
-                                    exclude_ids=active_subagent_tool_use_ids,
-                                )
-                            llm_parent_export = task_event_span_tracker.parent_export_for_message(
-                                message,
-                                span.export(),
-                            )
-                            final_content, extended_existing_span = llm_tracker.start_llm_span(
-                                message,
-                                self.__last_prompt,
-                                final_results,
-                                parent_export=llm_parent_export,
-                            )
-                            tool_tracker.start_tool_spans(message, llm_tracker.current_span_export)
-                            if final_content:
-                                if (
-                                    extended_existing_span
-                                    and final_results
-                                    and final_results[-1].get("role") == "assistant"
-                                ):
-                                    final_results[-1] = final_content
-                                else:
-                                    final_results.append(final_content)
-                        elif message_type == MessageClassName.USER:
-                            tool_tracker.finish_tool_spans(message)
-                            has_tool_results = False
-                            user_parent = getattr(message, "parent_tool_use_id", None)
-                            if hasattr(message, "content"):
-                                has_tool_results = any(
-                                    type(block).__name__ == BlockClassName.TOOL_RESULT for block in message.content
-                                )
-                                content = _serialize_content_blocks(message.content)
-                                final_results.append({"content": content, "role": "user"})
-                            if has_tool_results:
-                                llm_tracker.mark_next_llm_start(user_parent)
-                        elif message_type == MessageClassName.RESULT:
-                            llm_tracker.set_context(None)
-                            if hasattr(message, "usage"):
-                                usage_metrics = _extract_usage_from_result_message(message)
-                                llm_tracker.log_usage(usage_metrics)
-
-                            result_metadata = {
-                                k: v
-                                for k, v in {
-                                    "num_turns": getattr(message, "num_turns", None),
-                                    "session_id": getattr(message, "session_id", None),
-                                }.items()
-                                if v is not None
-                            }
-                            span.log(metadata=result_metadata)
-                        elif message_type in SYSTEM_MESSAGE_TYPES:
-                            task_event_span_tracker.process(message)
-                            task_events.append(_serialize_system_message(message))
-
+                        context_tracker.add(message)
                         yield message
                 except asyncio.CancelledError:
                     # The CancelledError may come from the subprocess transport
@@ -1168,19 +850,12 @@ async def receive_response(self) -> AsyncGenerator[Any, None]:
                     # the response stream ends cleanly. If the caller genuinely
                     # cancelled the task, they still have pending cancellation
                     # requests that will fire at their next await point.
-                    if final_results:
-                        span.log(output=final_results[-1])
+                    context_tracker.log_output()
                 else:
-                    if final_results:
-                        span.log(output=final_results[-1])
+                    context_tracker.log_output()
                 finally:
-                    if task_events:
-                        span.log(metadata={"task_events": task_events})
-                    task_event_span_tracker.cleanup()
-                    tool_tracker.cleanup()
-                    llm_tracker.cleanup()
-                    if hasattr(_thread_local, "tool_span_tracker"):
-                        delattr(_thread_local, "tool_span_tracker")
+                    context_tracker.log_tasks()
+                    context_tracker.cleanup()
 
         async def __aenter__(self) -> "WrappedClaudeSDKClient":
             await self.__client.__aenter__()
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
index 1ea93d84..52dcdeb4 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
@@ -883,7 +883,7 @@ async def test_relay_user_messages_between_parallel_agent_calls_do_not_split_llm
 async def test_agent_tool_spans_encapsulate_child_task_spans(memory_logger):
     """Agent TOOL spans must end after their child TASK spans, not before.
 
-    The mid-stream tool_tracker.cleanup() in the AssistantMessage handler must
+    The mid-stream tool_tracker.cleanup_context() in the AssistantMessage handler must
     not close Agent TOOL spans that still have active child TASK spans. Those
     Agent TOOL spans should only close when their ToolResult arrives.
     """
@@ -1427,7 +1427,7 @@ def test_tool_span_tracker_cleanup_closes_unmatched_spans(memory_logger):
             AssistantMessage(content=[ToolUseBlock(id="call-dangling", name="weather", input={"city": "Toronto"})]),
             llm_span.export(),
         )
-        tracker.cleanup()
+        tracker.cleanup_all()
         llm_span.end()
 
     spans = memory_logger.pop()
@@ -1709,7 +1709,7 @@ async def calculator_handler(args):
             )
         finally:
             _clear_tool_span_tracker()
-            tracker.cleanup()
+            tracker.cleanup_all()
             llm_span.end()
 
     assert result == {"content": [{"type": "text", "text": "42"}]}
@@ -1770,7 +1770,7 @@ async def calculator_handler(args):
             )
         finally:
             _clear_tool_span_tracker()
-            tracker.cleanup()
+            tracker.cleanup_all()
             llm_span.end()
 
     spans = memory_logger.pop()
@@ -2295,7 +2295,7 @@ async def echo_handler(args):
             )
         finally:
             _clear_tool_span_tracker()
-            tracker.cleanup()
+            tracker.cleanup_all()
             alpha_llm.end()
             beta_llm.end()
 
@@ -2382,7 +2382,7 @@ def test_dispatch_queue_assigns_identical_tool_spans_in_fifo_order(memory_logger
         # Cleanup
         first.release()
         second.release()
-        tracker.cleanup()
+        tracker.cleanup_all()
         llm_alpha.end()
         llm_beta.end()
 

From cc1ca3a6355e71040e24bee14934269924b02fb8 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Thu, 19 Mar 2026 00:12:54 -0400
Subject: [PATCH 08/12] Update PLAN.md: mark Step 5 as done, note
 llm_parent_export retention

---
 .../wrappers/claude_agent_sdk/PLAN.md         | 60 +++++++++----------
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md b/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
index f88a4014..6b5609bd 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
@@ -21,18 +21,22 @@ orchestrator).
 @dataclasses.dataclass
 class _AgentContext:
     llm_span: Any | None = None          # current open LLM span
+    llm_parent_export: str | None = None # parent of current LLM span (merge guard)
     llm_output: list[dict[str, Any]] | None = None  # accumulated output for merge path
     next_llm_start: float | None = None  # timestamp from tool results
     task_span: Any | None = None         # TASK span for this subagent
     task_confirmed: bool = False         # True after TaskStartedMessage
 ```
 
-Three fields dropped vs the old trackers:
+Two fields dropped vs the old trackers:
 - `llm_span_export` → derived: `ctx.llm_span.export() if ctx.llm_span else None`
-- `llm_parent_export` → redundant: consecutive merges only happen in the
-  orchestrator context where the parent never changes (see SIMPLIFICATION.md §3b)
 - `task_id` → written to metadata at creation, never read back
 
+`llm_parent_export` was retained (originally planned for removal) because it
+guards against incorrect merges when a subagent `AssistantMessage` with
+`parent_tool_use_id=None` follows an orchestrator `AssistantMessage` — the
+resolved parent changes but `next_llm_start` is still `None`.
+
 ### `ContextTracker` — public API
 
 ```python
@@ -530,41 +534,31 @@ module-level functions. `TaskEventSpanTracker._span_name` / `._metadata` /
 
 Done. Added both methods. Existing `cleanup()` left untouched.
 
-### Step 3 — Migrate mid-stream cleanup call in `receive_response`
-
-Replace the mid-stream `tool_tracker.cleanup(end_time=..., exclude_...,
-only_...)` call with `tool_tracker.cleanup_context(...)`. Simplify old
-`cleanup()` to delegate to `cleanup_all(end_time)`. Remove `_UNSET_PARENT` from
-`cleanup()`'s signature (the sentinel itself stays — `LLMSpanTracker` still
-uses it).
-
-**Dependencies:** Step 2.
+### Step 3 ✅ — Migrate mid-stream cleanup call in `receive_response`
 
-### Step 4 — Add `_AgentContext` and `ContextTracker`
+Done. Mid-stream call now uses `cleanup_context()`. Old `cleanup()` delegates
+to `cleanup_all()`. Two unit tests updated to use `cleanup_context()` directly.
 
-Implement the full `ContextTracker` class (dead code — not wired in yet).
+### Step 4 ✅ — Add `_AgentContext` and `ContextTracker`
 
-**Dependencies:** Steps 1 + 2.
+Done. Full `ContextTracker` class implemented (dead code — not wired in yet).
 
-### Step 5 — Wire `ContextTracker` into `receive_response`; delete old classes
+### Step 5 ✅ — Wire `ContextTracker` into `receive_response`; delete old classes
 
-- Rewrite `receive_response` to use `ContextTracker`.
-- Delete `LLMSpanTracker`, `TaskEventSpanTracker`, `_UNSET_PARENT`.
-- From `ToolSpanTracker`: remove `_pending_task_link_tool_use_ids` field +
-  property, `mark_task_started()`, the discard in `_end_tool_span` and
-  `start_tool_spans`, and old `cleanup()`.
+Done. Rewrote `receive_response` to use `ContextTracker`. Deleted
+`LLMSpanTracker`, `TaskEventSpanTracker`, `_UNSET_PARENT`. Cleaned up
+`ToolSpanTracker` (removed pending-task-link bookkeeping and old `cleanup()`).
 
-**Dependencies:** Steps 3 + 4.
+**Implementation note:** `llm_parent_export` was retained on `_AgentContext`
+(contrary to the original plan's §1b which proposed dropping it). Testing
+revealed it's needed when a subagent `AssistantMessage` arrives with
+`parent_tool_use_id=None` right after an orchestrator `AssistantMessage` — the
+parent export changes (root → task span) but `next_llm_start` is still `None`,
+so without the guard the two messages would incorrectly merge.
 
-### Dependency graph
+---
 
-```
-Step 0 (done)
-  │
-  ├─► Step 1 (extract helpers)       ─┐
-  │                                    ├─► Step 4 (ContextTracker) ─► Step 5 (wire + delete)
-  ├─► Step 2 (add cleanup methods)  ──┤
-  │                                    │
-  └─► Step 3 (migrate cleanup call) ──┘
-           ↑ depends on Step 2
-```
+All steps complete. The three-tracker architecture (`LLMSpanTracker` +
+`TaskEventSpanTracker` + `ToolSpanTracker`) has been replaced with two
+(`ContextTracker` + `ToolSpanTracker`), with `ContextTracker` owning the
+`ToolSpanTracker` as a private component.

From 9fd7f3d7920a6fc3365c8d350031809db566500d Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Thu, 19 Mar 2026 00:14:56 -0400
Subject: [PATCH 09/12] Clean up stale comments and remove dead code

- Remove _log_tracing_warning (unused after tracker consolidation)
- Remove unused logging import and log variable
- Update _create_llm_span_for_messages docstring: remove stale
  references to catch_exceptions block and ambient span nesting
  (now uses explicit parent export)
- Simplify receive_response docstring (tracing is via ContextTracker)
---
 .../wrappers/claude_agent_sdk/_wrapper.py      | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index c37745d3..68da7a1a 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -2,7 +2,6 @@
 import collections
 import dataclasses
 import json
-import logging
 import threading
 import time
 from collections.abc import AsyncGenerator, AsyncIterable
@@ -27,7 +26,6 @@
 )
 
 
-log = logging.getLogger(__name__)
 _thread_local = threading.local()
 
 
@@ -83,10 +81,6 @@ def release(self) -> None:
 _NOOP_ACTIVE_TOOL_SPAN = _NoopActiveToolSpan()
 
 
-def _log_tracing_warning(exc: Exception) -> None:
-    log.warning("Error in tracing code", exc_info=exc)
-
-
 def _parse_tool_name(tool_name: Any) -> ParsedToolName:
     raw_name = str(tool_name) if tool_name is not None else DEFAULT_TOOL_NAME
 
@@ -808,13 +802,7 @@ async def capturing_wrapper() -> AsyncGenerator[dict[str, Any], None]:
             return await self.__client.query(*args, **kwargs)
 
         async def receive_response(self) -> AsyncGenerator[Any, None]:
-            """Wrap receive_response to add tracing.
-
-            Uses start_span context manager which automatically:
-            - Handles exceptions and logs them as errors
-            - Sets the span as current so tool calls automatically nest under it
-            - Manages span lifecycle (start/end)
-            """
+            """Wrap receive_response to add tracing via ContextTracker."""
             generator = self.__client.receive_response()
 
             # Determine the initial input - may be updated later if using async generator
@@ -880,9 +868,7 @@ def _create_llm_span_for_messages(
     - final_content: The final message content to add to conversation history
     - span: The LLM span object (for logging metrics later)
 
-    Automatically nests under the current span (TASK span from receive_response).
-
-    Note: This is called from within a catch_exceptions block, so errors won't break user code.
+    Called by ContextTracker._start_or_merge_llm_span with an explicit parent export.
     """
     if not messages:
         return None, None

From 8a405d9169800dc4cf28f5f4b9e86cd68091bf7b Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Thu, 19 Mar 2026 00:19:36 -0400
Subject: [PATCH 10/12] delete plans

moved to https://gist.github.com/AbhiPrasad/28baa70846188d2ff0ce388d8166ec36
---
 .../claude_agent_sdk/INSTRUMENTATION.md       | 532 -----------------
 .../wrappers/claude_agent_sdk/PLAN.md         | 564 ------------------
 .../claude_agent_sdk/SIMPLIFICATION.md        | 366 ------------
 3 files changed, 1462 deletions(-)
 delete mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/INSTRUMENTATION.md
 delete mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
 delete mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/SIMPLIFICATION.md

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/INSTRUMENTATION.md b/py/src/braintrust/wrappers/claude_agent_sdk/INSTRUMENTATION.md
deleted file mode 100644
index 1a7e0b10..00000000
--- a/py/src/braintrust/wrappers/claude_agent_sdk/INSTRUMENTATION.md
+++ /dev/null
@@ -1,532 +0,0 @@
-# Claude Agent SDK Instrumentation — Deep Dive
-
-## Overview
-
-This document explains how the Braintrust wrapper instruments the Claude Agent SDK: how the monkeypatch works, what data structures are used, and how they collaborate to produce a correct span tree even when multiple subagents run concurrently on a single interleaved message stream.
-
----
-
-## 1. The Monkeypatch
-
-`setup_claude_agent_sdk()` (in `__init__.py`) patches three things in the `claude_agent_sdk` module **and** in every already-imported module in `sys.modules`:
-
-```
-claude_agent_sdk.ClaudeSDKClient   →  WrappedClaudeSDKClient   (via _create_client_wrapper_class)
-claude_agent_sdk.SdkMcpTool        →  WrappedSdkMcpTool        (via _create_tool_wrapper_class)
-claude_agent_sdk.tool              →  wrapped_tool_fn           (via _wrap_tool_factory)
-```
-
-All three wrappers are **generated at call time** via factory functions — they dynamically create new classes/functions that subclass or close over the originals. The `sys.modules` sweep handles the case where user code has already done `from claude_agent_sdk import ClaudeSDKClient` before calling `setup_claude_agent_sdk`.
-
-```
-User Code                       Braintrust Wrapper              Original SDK
-─────────                       ──────────────────              ────────────
-ClaudeSDKClient(...)       →    WrappedClaudeSDKClient(...)  →  original.__init__(...)
-  client.query(...)         →    captures prompt + start_time →  original.query(...)
-  client.receive_response() →    starts TASK span             →  original.receive_response()
-                                 processes every message
-                                 creates LLM/TOOL spans
-                                 yields message to user
-```
-
-`WrappedClaudeSDKClient` extends `Wrapper` (a base that proxies attribute access to the inner client), so any attributes the user accesses that aren't explicitly overridden fall through transparently to the original.
-
-### 1b. Why `SdkMcpTool` and `tool` are wrapped separately from `ClaudeSDKClient`
-
-`ClaudeSDKClient` is responsible for the **stream side**: it observes every message,
-creates TOOL spans for each `ToolUseBlock`, and stores them in `ToolSpanTracker`.
-At that point the spans exist but are **not yet the active context** — the tool
-handler hasn't run yet.
-
-`SdkMcpTool` and `tool` are responsible for the **handler side**: they intercept
-tool handler registration at decoration/instantiation time and wrap every handler
-via `_wrap_tool_handler`. When the Claude SDK later calls the handler (through its
-own internal machinery, not Braintrust code), the wrapper fires first:
-
-```python
-async def wrapped_handler(args):
-    active_tool_span = _activate_tool_span_for_handler(tool_name, args)
-
-    if not active_tool_span.has_span:
-        # No stream active — create a standalone TOOL span as a fallback
-        with start_span(name=str(tool_name), type=TOOL, input=args) as span:
-            result = await handler(args)
-            span.log(output=result)
-            return result
-
-    try:
-        return await handler(args)     # ← user code runs here, under the span
-    except Exception as exc:
-        active_tool_span.log_error(exc)
-        raise
-    finally:
-        active_tool_span.release()     # span.unset_current()
-```
-
-`_activate_tool_span_for_handler` reads the thread-local `ToolSpanTracker`, finds
-the pre-created span by `(tool_name, args)`, and calls `span.set_current()` —
-making that span the active context for the duration of the call. Any span the user
-creates *inside* their handler therefore nests under the correct TOOL span
-automatically.
-
-**The two-phase handoff in full:**
-
-```
-receive_response() — Braintrust controls          Claude SDK internals — Braintrust does NOT control
-──────────────────────────────────────            ──────────────────────────────────────────────────
-AssistantMessage arrives
-  → start_tool_spans()
-    → create TOOL span            ─── stored in ToolSpanTracker via thread-local ──→
-    → store in _active_spans
-
-                                                  SDK calls tool.handler(args)
-                                                    → _wrap_tool_handler fires
-                                                    → reads thread-local tracker
-                                                    → acquires + activates TOOL span
-                                                    → user handler runs nested under it
-                                                    → span released (unset_current)
-
-UserMessage arrives (ToolResultBlock)
-  → finish_tool_spans()
-    → log output + end span
-```
-
-**Without the `SdkMcpTool`/`tool` wrappers**, step 2 never happens. The pre-created
-spans sit in the tracker with their context never activated, and any spans created
-inside user handler code have no TOOL span parent — they would float up to the TASK
-span or be rootless.
-
-**The fallback path** (no stream active) covers two practical cases:
-- A tool handler called directly in a unit test.
-- A tool handler invoked before or after a `receive_response()` session.
-
-In both cases `_activate_tool_span_for_handler` finds no `ToolSpanTracker` on the
-thread-local and returns `_NOOP_ACTIVE_TOOL_SPAN`, triggering the `with start_span`
-fallback branch which creates and closes a standalone TOOL span for that single
-invocation.
-
----
-
-## 2. The SDK Message Stream
-
-The Claude Agent SDK streams messages from a subprocess over a JSON protocol. Every message is surfaced on a single `async for message in client.receive_response()` iterator. When subagents run concurrently, their messages **interleave** on this one stream:
-
-```
-─────── Single stream (time flows down) ────────────────────────────────────────────────
- AssistantMessage  (orchestrator: calls Agent A and Agent B)
- SystemMessage     (TaskStarted for task A)
- SystemMessage     (TaskStarted for task B)
- AssistantMessage  (subagent A's LLM turn: calls Bash)    ← parent_tool_use_id = "call-A"
- AssistantMessage  (subagent B's LLM turn: calls Read)    ← parent_tool_use_id = "call-B"
- UserMessage       (Bash result for A)                    ← parent_tool_use_id = "call-A"
- UserMessage       (Read result for B)                    ← parent_tool_use_id = "call-B"
- SystemMessage     (TaskNotification for task A — done)
- SystemMessage     (TaskNotification for task B — done)
- ResultMessage     (final usage)
-────────────────────────────────────────────────────────────────────────────────────────
-```
-
-The key field is `parent_tool_use_id`: every message from a subagent carries the `tool_use_id` of the `Agent` tool call that spawned it. Orchestrator messages have `parent_tool_use_id = None`.
-
----
-
-## 3. The Span Hierarchy Being Built
-
-```
-Claude Agent  [TASK]
-├── anthropic.messages.create  [LLM]   ← orchestrator's turn
-│   ├── Agent  [TOOL]                  ← "Agent" tool call → spawns subagent A
-│   └── Agent  [TOOL]                  ← "Agent" tool call → spawns subagent B
-├── Task A  [TASK]
-│   ├── anthropic.messages.create  [LLM]   ← subagent A turn 1
-│   │   └── Bash  [TOOL]
-│   └── anthropic.messages.create  [LLM]   ← subagent A turn 2
-│       └── Read  [TOOL]
-└── Task B  [TASK]
-    ├── anthropic.messages.create  [LLM]   ← subagent B turn 1
-    │   └── Bash  [TOOL]
-    └── anthropic.messages.create  [LLM]   ← subagent B turn 2
-        └── Read  [TOOL]
-```
-
-Three independent trackers collaborate to build this tree. They are described below.
-
----
-
-## 4. Data Structures
-
-### 4a. `ParsedToolName` (frozen dataclass)
-
-```python
-@dataclasses.dataclass(frozen=True)
-class ParsedToolName:
-    raw_name: str        # "mcp__server__remote_tool"
-    display_name: str    # "remote_tool"   (or same as raw_name for non-MCP)
-    is_mcp: bool         # True
-    mcp_server: str|None # "server"
-```
-
-MCP tools from the Claude SDK have names like `mcp__myserver__some_tool`. `_parse_tool_name()` splits on `__` to extract `server` and `some_tool`, giving the span a clean display name and storing MCP metadata.
-
----
-
-### 4b. `_ActiveToolSpan` (dataclass)
-
-One instance per live tool call. Lives in `ToolSpanTracker._active_spans` keyed by `tool_use_id`.
-
-```
-_ActiveToolSpan
-┌─────────────────────────────────────────────────────┐
-│ span              : the Braintrust span object       │
-│ raw_name          : "mcp__server__tool"              │
-│ display_name      : "tool"                           │
-│ input             : {"arg": "val"}  ← from SDK block │
-│ tool_use_id       : "toolu_abc123"                   │
-│ parent_tool_use_id: "toolu_agent_a" ← which subagent │
-│ handler_active    : False ← True while handler runs  │
-└─────────────────────────────────────────────────────┘
-```
-
-`activate()` sets `handler_active=True` and calls `span.set_current()` — making the Braintrust span the active context so any `start_span()` inside a tool handler automatically nests under it. `release()` undoes this.
-
-There is also `_NoopActiveToolSpan` — a sentinel used when no matching span is found. It has the same interface but does nothing, so `_wrap_tool_handler` can call `.activate()` / `.release()` unconditionally without null checks.
-
----
-
-### 4c. `ToolSpanTracker`
-
-This is the most complex tracker. It manages all live tool spans across all subagent contexts.
-
-```
-ToolSpanTracker
-┌───────────────────────────────────────────────────────────────────────────────┐
-│                                                                               │
-│  _active_spans: dict[tool_use_id → _ActiveToolSpan]                          │
-│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐                        │
-│  │ "toolu_a1"   │  │ "toolu_b1"   │  │ "toolu_c1"   │  ...                   │
-│  │ Bash         │  │ Bash         │  │ remote_tool  │                        │
-│  │ parent=A     │  │ parent=B     │  │ parent=C     │                        │
-│  └──────────────┘  └──────────────┘  └──────────────┘                        │
-│                                                                               │
-│  _dispatch_queues: dict[(tool_name, input_sig) → deque[tool_use_id]]         │
-│  ┌──────────────────────────────────────────────────┐                        │
-│  │ ("Bash", '{"cmd":"echo"}') → deque["a1", "b1"]   │  ← FIFO               │
-│  │ ("Read", '{"path":"/f"}')  → deque["a2", "b2"]   │                        │
-│  └──────────────────────────────────────────────────┘                        │
-│                                                                               │
-│  _pending_task_link_tool_use_ids: set[tool_use_id]                           │
-│  { "toolu_agent_a", "toolu_agent_b" }  ← "Agent" calls awaiting TaskStarted  │
-│                                                                               │
-└───────────────────────────────────────────────────────────────────────────────┘
-```
-
-**Lifecycle of a tool span through `ToolSpanTracker`:**
-
-```
-AssistantMessage arrives with ToolUseBlock
-        │
-        ▼
-  start_tool_spans()
-  ├── creates span with parent = current LLM span export
-  ├── inserts into _active_spans[tool_use_id]
-  ├── enqueues tool_use_id into _dispatch_queues[(name, input)]
-  └── if name == "Agent": adds to _pending_task_link_tool_use_ids
-
-Tool handler is called (by Claude SDK)
-        │
-        ▼
-  _activate_tool_span_for_handler()
-  ├── reads _thread_local.tool_span_tracker
-  └── calls tracker.acquire_span_for_handler(name, args)
-      ├── find candidates: active spans with matching name, not handler_active
-      ├── _match_via_dispatch_queue()   ← try FIFO first
-      │   └── pop from deque, return matching candidate
-      ├── fallback: _match_tool_span_for_handler()  ← exact input match
-      └── matched_span.activate()  → handler_active=True, set_current()
-
-Tool handler finishes / UserMessage with ToolResultBlock arrives
-        │
-        ▼
-  finish_tool_spans()
-  └── _end_tool_span(tool_use_id, tool_result_block=block)
-      ├── pop from _active_spans
-      ├── remove from _dispatch_queues
-      ├── log output from ToolResultBlock
-      └── span.end()
-```
-
-**`_dispatch_queues` — the FIFO disambiguator:**
-
-When subagent A and subagent B both call `Bash` with `{"cmd": "echo hi"}`, two identical `_ActiveToolSpan` entries exist. Without disambiguation, `acquire_span_for_handler` can't tell which handler invocation should own which span. The dispatch queue solves this by recording creation order:
-
-```
-Creation order:         Queue state:
-  span "bash-A" added   →  ("Bash", '{"cmd":"echo hi"}')  deque: ["bash-A"]
-  span "bash-B" added   →  ("Bash", '{"cmd":"echo hi"}')  deque: ["bash-A", "bash-B"]
-
-Handler for A fires:    pop "bash-A"  →  give it bash-A span  ✓
-Handler for B fires:    pop "bash-B"  →  give it bash-B span  ✓
-```
-
-**`cleanup()` — the scoped closer:**
-
-```python
-def cleanup(self, end_time=None, exclude_tool_use_ids=None, only_parent_tool_use_id=_UNSET_PARENT)
-```
-
-Three filter modes:
-- No filters → close all active spans (called at the very end of `receive_response`).
-- `exclude_tool_use_ids` → skip "Agent" spans still waiting for their `TaskStarted` event.
-- `only_parent_tool_use_id` → **only** close spans belonging to a specific subagent context. This is called every time an `AssistantMessage` arrives, scoped to that message's `parent_tool_use_id`, so it never accidentally closes another subagent's still-open tool spans.
-
----
-
-### 4d. `LLMSpanTracker._SubagentState` (inner dataclass)
-
-One per subagent context. `None` key = orchestrator.
-
-```
-_SubagentState
-┌──────────────────────────────────────────────────────────────────┐
-│ current_span        : the open LLM span (or None)                │
-│ current_span_export : span.export() string for use as parent ref  │
-│ current_parent_export: parent export used when span was created   │
-│ current_output      : [{"role":"assistant","content":[...]}]      │
-│                       accumulated output so streaming chunks merge│
-│ next_start_time     : float timestamp — when the next LLM call    │
-│                       will start (set after tool results arrive)  │
-└──────────────────────────────────────────────────────────────────┘
-```
-
-`next_start_time` is the key to non-overlapping sequential spans within one subagent. The sequence is:
-
-```
-UserMessage (tool results arrive)
-    → mark_next_llm_start()  ← stamps the time NOW
-
-AssistantMessage (next LLM response)
-    → start_llm_span()
-        → resolved_start_time = next_start_time  (the stamp from above)
-        → current_span.end(end_time=resolved_start_time)  ← previous span ends HERE
-        → create new span with start = resolved_start_time
-        → next_start_time = None
-```
-
-This ensures the outgoing LLM span ends exactly when the next one begins — no gap, no overlap — even though the Python code observing the stream sees them arrive sequentially.
-
----
-
-### 4e. `LLMSpanTracker`
-
-Manages a `_SubagentState` for every subagent context, plus an `_active_context` pointer that says "which state should the next operation touch":
-
-```
-LLMSpanTracker
-┌───────────────────────────────────────────────────────────────────────────────┐
-│                                                                               │
-│  _active_context: "call-A"   ← set by set_context() on each AssistantMessage │
-│                                                                               │
-│  _states: dict[parent_tool_use_id → _SubagentState]                          │
-│  ┌──────────────────┐  ┌──────────────────┐  ┌──────────────────┐            │
-│  │ None (orchestr.) │  │ "call-A"         │  │ "call-B"         │            │
-│  │ next_start=t0    │  │ current_span=s1  │  │ current_span=s2  │            │
-│  │ current_span=s0  │  │ next_start=None  │  │ next_start=t1    │            │
-│  └──────────────────┘  └──────────────────┘  └──────────────────┘            │
-│                                                                               │
-└───────────────────────────────────────────────────────────────────────────────┘
-```
-
-**Context routing via `_get_state`:**
-
-```python
-def _get_state(self, parent_tool_use_id=_UNSET_PARENT):
-    key = self._active_context if parent_tool_use_id is _UNSET_PARENT else parent_tool_use_id
-    ...
-```
-
-- Called with `_UNSET_PARENT` (the default) → uses `_active_context`, whichever subagent was most recently set via `set_context()`.
-- Called with an explicit value (e.g. from `mark_next_llm_start(user_parent)`) → routes directly to that subagent's state regardless of `_active_context`.
-
-This is why `_UNSET_PARENT = object()` exists — it is a sentinel that can be distinguished from `None`, which is a valid key meaning "orchestrator".
-
-**`mark_next_llm_start()` edge case:**
-
-UserMessages from the Claude SDK sometimes don't carry `parent_tool_use_id` even when they belong to a subagent context. The special-case logic handles this:
-
-```python
-def mark_next_llm_start(self, parent_tool_use_id=_UNSET_PARENT):
-    if parent_tool_use_id is None and self._active_context is not None:
-        parent_tool_use_id = _UNSET_PARENT  # fall back to active context
-    self._get_state(parent_tool_use_id).next_start_time = time.time()
-```
-
-If the UserMessage says `parent_tool_use_id=None` (field absent or None) but `_active_context` is set (we are processing a subagent's turn), treat it as "active context" rather than routing to the orchestrator state.
-
----
-
-### 4f. `TaskEventSpanTracker`
-
-Manages TASK spans for subagent tasks, driven by `SystemMessage` subtypes.
-
-```
-TaskEventSpanTracker
-┌───────────────────────────────────────────────────────────────────────────────┐
-│ _root_span_export        : export of the top-level "Claude Agent" TASK span  │
-│ _tool_tracker            : ref to ToolSpanTracker (to get Agent span export) │
-│                                                                               │
-│ _active_spans            : dict[task_id → span]                              │
-│ ┌──────────────┐  ┌──────────────┐                                           │
-│ │ "task_a"     │  │ "task_b"     │  ...                                      │
-│ │ span=...     │  │ span=...     │                                            │
-│ └──────────────┘  └──────────────┘                                           │
-│                                                                               │
-│ _task_span_by_tool_use_id: dict[agent_tool_use_id → span]                    │
-│ ┌─────────────────────────────────────────────────────┐                      │
-│ │ "toolu_agent_a" → task-A span                       │                      │
-│ │ "toolu_agent_b" → task-B span                       │                      │
-│ └─────────────────────────────────────────────────────┘                      │
-│                                                                               │
-│ _active_task_order       : ["task_a", "task_b"]  ← insertion order          │
-└───────────────────────────────────────────────────────────────────────────────┘
-```
-
-**Lifecycle:**
-
-- `TaskStartedMessage` → create a TASK span. Parent is the `Agent` tool span for this task (looked up via `_tool_tracker.get_span_export(message.tool_use_id)`), falling back to the root span. Also calls `_tool_tracker.mark_task_started(tool_use_id)`, removing the agent tool_use_id from `_pending_task_link_tool_use_ids`, which tells `cleanup()` it is now safe to close that `Agent` span.
-- `TaskProgressMessage` → log metadata/output updates to the existing TASK span.
-- `TaskNotificationMessage` → end the TASK span and remove it from both dicts.
-
-**`parent_export_for_message()`** finds the right parent for a subagent's LLM span given an `AssistantMessage`:
-
-1. If `parent_tool_use_id` is set, look up `_task_span_by_tool_use_id[parent_tool_use_id]` — return that task span as parent. ✓
-2. Else if the message itself contains an `Agent` ToolUseBlock (orchestrator calling a subagent), use the top-level span as parent (not the most recently opened task).
-3. Else fall back to the latest open task span in `_active_task_order`.
-
----
-
-## 5. Thread-Local: Bridging the Stream to Tool Handlers
-
-The trickiest part is that tool handlers are called **by the Claude SDK** — not directly by Braintrust code. There is no way to pass context as a function argument. The solution is a thread-local:
-
-```python
-_thread_local = threading.local()
-```
-
-At the start of `receive_response()`:
-```python
-_thread_local.tool_span_tracker = tool_tracker
-```
-
-Inside every wrapped tool handler:
-```python
-def _activate_tool_span_for_handler(tool_name, args):
-    tool_span_tracker = getattr(_thread_local, "tool_span_tracker", None)
-    if tool_span_tracker is None:
-        return _NOOP_ACTIVE_TOOL_SPAN  # no tracing session active
-    return tool_span_tracker.acquire_span_for_handler(tool_name, args) or _NOOP_ACTIVE_TOOL_SPAN
-```
-
-This means:
-- One `receive_response()` session running on a thread → that thread's tool handlers find their tracker.
-- If a tool is called outside of a `receive_response()` session → returns `_NOOP_ACTIVE_TOOL_SPAN`, tracing is skipped gracefully.
-- The thread-local is cleaned up in the `finally` block of `receive_response()`.
-
-```
-Thread T1: receive_response() starts
-  _thread_local.tool_span_tracker = tracker_T1
-
-  Claude SDK calls tool handler "Bash" on T1
-    → _activate_tool_span_for_handler reads _thread_local.tool_span_tracker
-    → gets tracker_T1 → acquires correct span → handler runs nested under it
-
-  receive_response() finally:
-    del _thread_local.tool_span_tracker
-```
-
----
-
-## 6. Full Message Loop
-
-How every message type affects each tracker:
-
-```
-Message arrives from SDK
-│
-├── AssistantMessage  (parent_tool_use_id = X)
-│   ├── llm_tracker.set_context(X)              → route all LLM ops to subagent X's state
-│   ├── if current LLM span + active tool spans:
-│   │   tool_tracker.cleanup(                   → close only X's dangling tool spans
-│   │       end_time=next_start_time,           → timed to the gap before this LLM span
-│   │       exclude=active_subagent_ids,        → leave "Agent" spans still open
-│   │       only_parent=X)                      → don't touch other subagents' tool spans
-│   ├── task_event_span_tracker
-│   │       .parent_export_for_message()        → find which TASK span is the parent
-│   ├── llm_tracker.start_llm_span(...)         → end previous span for X; start new one
-│   └── tool_tracker.start_tool_spans(...)      → open tool spans for any ToolUseBlocks
-│
-├── UserMessage  (parent_tool_use_id = X)
-│   ├── tool_tracker.finish_tool_spans(...)     → close tool spans with output from ToolResultBlocks
-│   └── if has_tool_results:
-│       llm_tracker.mark_next_llm_start(X)     → stamp "next LLM for X starts now"
-│
-├── ResultMessage
-│   ├── llm_tracker.set_context(None)           → route to orchestrator state
-│   └── llm_tracker.log_usage(...)              → attach token usage to orchestrator LLM span
-│
-└── SystemMessage / TaskStarted / TaskProgress / TaskNotification
-    └── task_event_span_tracker.process(...)    → create / update / end TASK spans
-```
-
----
-
-## 7. End-to-End Example: Two Concurrent Subagents
-
-Walkthrough of exactly what the three trackers look like at each step for the `test_interleaved_subagent_tool_output_preserved` scenario:
-
-```
-Stream event                         ToolSpanTracker._active_spans    LLMTracker._states           TaskEventSpanTracker
-────────────────────────────────     ────────────────────────────     ─────────────────────        ────────────────────
-[1] AssistantMessage(parent=None)    {}                               {None: {span=LLM-0}}         {}
-    orchestrator calls Agent(α), Agent(β)
-    after start_tool_spans:
-                                     {"call-α": Agent-span-α,
-                                      "call-β": Agent-span-β}
-                                     pending: {"call-α", "call-β"}
-
-[2] TaskStartedMessage(task=alpha)   pending: {"call-β"}             (unchanged)                  {"alpha": Task-α (parent=Agent-α)}
-[3] TaskStartedMessage(task=beta)    pending: {}                     (unchanged)                  {"alpha": Task-α, "beta": Task-β}
-
-[4] AssistantMessage(parent=call-α)  (subagent alpha's LLM turn: Bash call)
-    set_context("call-α")
-    cleanup(only_parent="call-α")    → closes nothing (α has no old tool spans)
-    start_llm_span                   (unchanged)                     {"call-α": {span=LLM-α}}
-    start_tool_spans("bash-1")       {"call-α": Bash-span (parent=LLM-α),
-                                      "call-β": Agent-span-β}
-
-[5] AssistantMessage(parent=call-β)  (subagent beta's LLM turn: Read call)
-    set_context("call-β")
-    cleanup(only_parent="call-β")    → closes nothing (β has no old tool spans)
-                                     Bash-span is NOT closed ← key fix
-    start_llm_span                   (unchanged)                     {"call-β": {span=LLM-β}}
-    start_tool_spans("read-1")       {"call-α": Bash-span (still open!),
-                                      "call-β": Read-span (parent=LLM-β)}
-
-[6] UserMessage(ToolResult bash-1="alpha_file_contents", parent=call-α)
-    finish_tool_spans                Bash-span.log(output), Bash-span.end()
-    mark_next_llm_start("call-α")                                    {call-α: {next_start=now}}
-
-[7] UserMessage(ToolResult read-1="beta_file_contents", parent=call-β)
-    finish_tool_spans                Read-span.log(output), Read-span.end()
-    mark_next_llm_start("call-β")                                    {call-β: {next_start=now}}
-
-[8] ResultMessage
-    set_context(None)
-    log_usage                                                         LLM-0.log(tokens)
-
-finally:
-    task_event_span_tracker.cleanup()  → end Task-α, Task-β
-    tool_tracker.cleanup()             → end Agent-α, Agent-β (if still open)
-    llm_tracker.cleanup()              → end LLM-α, LLM-β, LLM-0
-```
-
-At step [5], the old code called `cleanup()` globally, ending Bash-span before step [6] could record its output. The `only_parent_tool_use_id="call-β"` filter introduced by the fix prevents that — Bash-span survives to receive its result.
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md b/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
deleted file mode 100644
index 6b5609bd..00000000
--- a/py/src/braintrust/wrappers/claude_agent_sdk/PLAN.md
+++ /dev/null
@@ -1,564 +0,0 @@
-# Simplification Plan: Claude Agent SDK Instrumentation
-
-Replace `LLMSpanTracker` + `TaskEventSpanTracker` with a single `ContextTracker`
-class that consumes the raw SDK message stream and owns all span bookkeeping.
-See `SIMPLIFICATION.md` for the full rationale.
-
-**Unchanged:** `WrappedSdkMcpTool`, `_wrap_tool_handler`,
-`_activate_tool_span_for_handler`, `_thread_local`, `_dispatch_queues`,
-`next_llm_start` stamping, test cassettes.
-
----
-
-## Target Design
-
-### `_AgentContext`
-
-One instance per subagent context, keyed by `parent_tool_use_id` (`None` =
-orchestrator).
-
-```python
-@dataclasses.dataclass
-class _AgentContext:
-    llm_span: Any | None = None          # current open LLM span
-    llm_parent_export: str | None = None # parent of current LLM span (merge guard)
-    llm_output: list[dict[str, Any]] | None = None  # accumulated output for merge path
-    next_llm_start: float | None = None  # timestamp from tool results
-    task_span: Any | None = None         # TASK span for this subagent
-    task_confirmed: bool = False         # True after TaskStartedMessage
-```
-
-Two fields dropped vs the old trackers:
-- `llm_span_export` → derived: `ctx.llm_span.export() if ctx.llm_span else None`
-- `task_id` → written to metadata at creation, never read back
-
-`llm_parent_export` was retained (originally planned for removal) because it
-guards against incorrect merges when a subagent `AssistantMessage` with
-`parent_tool_use_id=None` follows an orchestrator `AssistantMessage` — the
-resolved parent changes but `next_llm_start` is still `None`.
-
-### `ContextTracker` — public API
-
-```python
-class ContextTracker:
-    def __init__(self, root_span, prompt, query_start_time=None):
-        self._root_span = root_span
-        self._root_span_export = root_span.export()
-        self._prompt = prompt
-        self._tool_tracker = ToolSpanTracker()        # private, also set on _thread_local
-        self._contexts: dict[str | None, _AgentContext] = {
-            None: _AgentContext(next_llm_start=query_start_time)
-        }
-        self._active_key: str | None = None           # most recent parent_tool_use_id
-        self._task_order: list[str | None] = []       # insertion-order for parent fallback
-        self._final_results: list[dict[str, Any]] = []
-        self._task_events: list[dict[str, Any]] = []
-        _thread_local.tool_span_tracker = self._tool_tracker
-
-    def add(self, message) -> None:
-        """Dispatch one SDK message to the appropriate handler."""
-        message_type = type(message).__name__
-        if message_type == MessageClassName.ASSISTANT:
-            self._handle_assistant(message)
-        elif message_type == MessageClassName.USER:
-            self._handle_user(message)
-        elif message_type == MessageClassName.RESULT:
-            self._handle_result(message)
-        elif message_type in SYSTEM_MESSAGE_TYPES:
-            self._handle_system(message)
-
-    def log_output(self) -> None:
-        if self._final_results:
-            self._root_span.log(output=self._final_results[-1])
-
-    def log_tasks(self) -> None:
-        if self._task_events:
-            self._root_span.log(metadata={"task_events": self._task_events})
-
-    def cleanup(self) -> None:
-        for ctx in self._contexts.values():
-            if ctx.llm_span:
-                ctx.llm_span.end()
-                ctx.llm_span = None
-            if ctx.task_span:
-                ctx.task_span.end()
-                ctx.task_span = None
-        self._task_order.clear()
-        self._tool_tracker.cleanup_all()
-        if hasattr(_thread_local, "tool_span_tracker"):
-            delattr(_thread_local, "tool_span_tracker")
-```
-
-### `ContextTracker` — internal handlers
-
-#### `_handle_assistant`
-
-Called on each `AssistantMessage`. This is the most complex handler because it
-orchestrates tool cleanup, LLM span creation/merge, tool span creation, and
-agent context pre-registration — all scoped to the correct subagent context.
-
-Corresponds to the `AssistantMessage` branch of the current `receive_response`
-loop, which coordinates across all three old trackers.
-
-```python
-def _handle_assistant(self, message: Any) -> None:
-    incoming_parent = getattr(message, "parent_tool_use_id", None)
-    self._active_key = incoming_parent
-    ctx = self._get_context(incoming_parent)
-
-    # 1. Close dangling tool spans from the previous turn in this context.
-    #    Skip Agent tool spans that are still live (pending or task running).
-    #    Replaces: tool_tracker.cleanup(end_time=..., exclude_tool_use_ids=...,
-    #              only_parent_tool_use_id=...)
-    if ctx.llm_span and self._tool_tracker.has_active_spans:
-        self._tool_tracker.cleanup_context(
-            incoming_parent,
-            end_time=ctx.next_llm_start or time.time(),
-            exclude_ids=self._live_agent_tool_use_ids(),
-        )
-
-    # 2. Resolve LLM span parent, then create or merge.
-    #    Replaces: task_event_span_tracker.parent_export_for_message(...)
-    #              + llm_tracker.start_llm_span(...)
-    parent_export = self._llm_parent_for_message(message)
-    final_content, extended = self._start_or_merge_llm_span(message, parent_export, ctx)
-
-    # 3. Open TOOL spans for tool calls in this message (parent = LLM span).
-    #    Replaces: tool_tracker.start_tool_spans(message, llm_tracker.current_span_export)
-    llm_export = ctx.llm_span.export() if ctx.llm_span else None
-    self._tool_tracker.start_tool_spans(message, llm_export)
-
-    # 4. Pre-create contexts for Agent tool calls so cleanup_context will
-    #    skip them before their TaskStartedMessage arrives.
-    #    Replaces: tool_tracker._pending_task_link_tool_use_ids.add(...)
-    self._register_pending_agent_contexts(message)
-
-    # 5. Accumulate conversation history.
-    if final_content:
-        if (extended
-                and self._final_results
-                and self._final_results[-1].get("role") == "assistant"):
-            self._final_results[-1] = final_content
-        else:
-            self._final_results.append(final_content)
-```
-
-#### `_handle_user`
-
-Called on each `UserMessage`. Finishes tool spans that have results, serializes
-content for conversation history, and stamps `next_llm_start` on the correct
-context.
-
-The context resolution here replaces the `_UNSET_PARENT` sentinel: if the
-`UserMessage` has no `parent_tool_use_id`, we use `_active_key` (the most
-recently seen `AssistantMessage`'s context) instead of falling back inside the
-tracker.
-
-```python
-def _handle_user(self, message: Any) -> None:
-    self._tool_tracker.finish_tool_spans(message)
-    has_tool_results = False
-    if hasattr(message, "content"):
-        has_tool_results = any(
-            type(b).__name__ == BlockClassName.TOOL_RESULT for b in message.content
-        )
-        content = _serialize_content_blocks(message.content)
-        self._final_results.append({"content": content, "role": "user"})
-    if has_tool_results:
-        user_parent = getattr(message, "parent_tool_use_id", None)
-        resolved_key = user_parent if user_parent is not None else self._active_key
-        self._get_context(resolved_key).next_llm_start = time.time()
-```
-
-#### `_handle_result`
-
-Called on `ResultMessage` (end of stream). Logs usage metrics to the
-orchestrator's LLM span and session metadata to the root span.
-
-```python
-def _handle_result(self, message: Any) -> None:
-    self._active_key = None
-    if hasattr(message, "usage"):
-        usage_metrics = _extract_usage_from_result_message(message)
-        ctx = self._get_context(None)
-        if ctx.llm_span and usage_metrics:
-            ctx.llm_span.log(metrics=usage_metrics)
-    result_metadata = {
-        k: v for k, v in {
-            "num_turns": getattr(message, "num_turns", None),
-            "session_id": getattr(message, "session_id", None),
-        }.items() if v is not None
-    }
-    if result_metadata:
-        self._root_span.log(metadata=result_metadata)
-```
-
-#### `_handle_system`
-
-Called on `SystemMessage` subtypes (TaskStarted, TaskProgress,
-TaskNotification). Resolves the Agent tool span export from `ToolSpanTracker`,
-then delegates to `_process_task_event`.
-
-This keeps `ContextTracker` and `ToolSpanTracker` loosely coupled:
-`ContextTracker` asks for the export string; `ToolSpanTracker` doesn't need a
-back-reference.
-
-```python
-def _handle_system(self, message: Any) -> None:
-    agent_span_export = self._tool_tracker.get_span_export(
-        getattr(message, "tool_use_id", None)
-    )
-    self._process_task_event(message, agent_span_export)
-    self._task_events.append(_serialize_system_message(message))
-```
-
-### `ContextTracker` — internal helpers
-
-#### `_get_context`
-
-Lazy-create `_AgentContext` instances on demand.
-
-```python
-def _get_context(self, key: str | None) -> _AgentContext:
-    ctx = self._contexts.get(key)
-    if ctx is None:
-        ctx = _AgentContext()
-        self._contexts[key] = ctx
-    return ctx
-```
-
-#### `_register_pending_agent_contexts`
-
-Pre-create an `_AgentContext` (with `task_confirmed=False`) for each Agent tool
-call in an `AssistantMessage`. This ensures `_live_agent_tool_use_ids` will
-include them, preventing `cleanup_context` from closing the Agent tool span
-before its `TaskStartedMessage` arrives.
-
-Replaces `ToolSpanTracker._pending_task_link_tool_use_ids.add()`.
-
-```python
-def _register_pending_agent_contexts(self, message: Any) -> None:
-    if not hasattr(message, "content"):
-        return
-    for block in message.content:
-        if (type(block).__name__ == BlockClassName.TOOL_USE
-                and getattr(block, "name", None) == "Agent"):
-            tool_use_id = getattr(block, "id", None)
-            if tool_use_id:
-                self._get_context(str(tool_use_id))
-```
-
-#### `_live_agent_tool_use_ids`
-
-Returns tool_use_ids of Agent spans that must not be closed yet. Includes both
-unconfirmed contexts (pending) and confirmed contexts whose task span is still
-open.
-
-Replaces the union of `task_event_span_tracker.active_tool_use_ids |
-tool_tracker.pending_task_link_tool_use_ids` in the old `receive_response`.
-
-```python
-def _live_agent_tool_use_ids(self) -> frozenset[str]:
-    result: set[str] = set()
-    for key, ctx in self._contexts.items():
-        if key is None:
-            continue
-        if not ctx.task_confirmed or ctx.task_span is not None:
-            result.add(key)
-    return frozenset(result)
-```
-
-#### `_llm_parent_for_message`
-
-Determines the parent span export for an incoming `AssistantMessage`.
-
-Replaces `TaskEventSpanTracker.parent_export_for_message()`. The logic is the
-same but reads directly from `_contexts` instead of a separate
-`_task_span_by_tool_use_id` dict.
-
-```python
-def _llm_parent_for_message(self, message: Any) -> str:
-    parent_tool_use_id = getattr(message, "parent_tool_use_id", None)
-
-    # 1. Subagent message → use that subagent's task span.
-    if parent_tool_use_id is not None:
-        ctx = self._contexts.get(str(parent_tool_use_id))
-        if ctx is not None and ctx.task_span is not None:
-            return ctx.task_span.export()
-
-    # 2. Orchestrator launching Agent tools → root span (not a task span).
-    if _message_starts_subagent_tool(message):
-        return self._root_span_export
-
-    # 3. Fallback: most recently opened task span (orchestrator messages
-    #    that arrive while a subagent task is running).
-    for key in reversed(self._task_order):
-        ctx = self._contexts.get(key)
-        if ctx is not None and ctx.task_span is not None:
-            return ctx.task_span.export()
-
-    # 4. Root span.
-    return self._root_span_export
-```
-
-#### `_start_or_merge_llm_span`
-
-Starts a new LLM span or extends the existing one via merge.
-
-**Merge path:** consecutive `AssistantMessage`s in the same context with no tool
-results between them (`ctx.next_llm_start is None`). This happens in the
-orchestrator context when the model emits a thinking block then a tool-call
-block as two separate messages. Returns `(merged_content, True)`.
-
-**New span path:** ends the previous span at `resolved_start`, opens a fresh
-one. Returns `(final_content, False)`.
-
-The `llm_parent_export` guard from `LLMSpanTracker` is dropped — see
-SIMPLIFICATION.md §3b for why it's always true in practice.
-
-```python
-def _start_or_merge_llm_span(
-    self, message: Any, parent_export: str | None, ctx: _AgentContext,
-) -> tuple[dict[str, Any] | None, bool]:
-    current_message = _serialize_assistant_message(message)
-
-    # Merge path.
-    if ctx.llm_span and ctx.next_llm_start is None and current_message is not None:
-        merged = _merge_assistant_messages(
-            ctx.llm_output[0] if ctx.llm_output else None,
-            current_message,
-        )
-        if merged is not None:
-            ctx.llm_output = [merged]
-            ctx.llm_span.log(output=ctx.llm_output)
-        return merged, True
-
-    # New span path.
-    resolved_start = ctx.next_llm_start or time.time()
-    first_token_time = time.time()
-
-    if ctx.llm_span:
-        ctx.llm_span.end(end_time=resolved_start)
-
-    final_content, span = _create_llm_span_for_messages(
-        [message], self._prompt, self._final_results,
-        parent=parent_export, start_time=resolved_start,
-    )
-    if span is not None:
-        span.log(metrics={"time_to_first_token": max(0.0, first_token_time - resolved_start)})
-    ctx.llm_span = span
-    ctx.llm_output = [final_content] if final_content is not None else None
-    ctx.next_llm_start = None
-    return final_content, False
-```
-
-#### `_process_task_event`
-
-Handles TaskStarted / TaskProgress / TaskNotification system messages.
-
-Key difference from `TaskEventSpanTracker.process()`: contexts are keyed by
-`tool_use_id` (not `task_id`), because that's the same key used everywhere else
-in `ContextTracker`. The old tracker maintained two parallel dicts
-(`_active_spans` keyed by `task_id` and `_task_span_by_tool_use_id` keyed by
-`tool_use_id`); this merges them.
-
-```python
-def _process_task_event(self, message: Any, agent_span_export: str | None) -> None:
-    task_id = getattr(message, "task_id", None)
-    if task_id is None:
-        return
-    task_id = str(task_id)
-    tool_use_id = getattr(message, "tool_use_id", None)
-    tool_use_id_str = str(tool_use_id) if tool_use_id is not None else None
-    ctx = self._get_context(tool_use_id_str)
-    message_type = type(message).__name__
-
-    if ctx.task_span is None:
-        # TaskStartedMessage — open the TASK span.
-        ctx.task_span = start_span(
-            name=_task_span_name(message, task_id),
-            span_attributes={"type": SpanTypeAttribute.TASK},
-            metadata=_task_metadata(message),
-            parent=agent_span_export or self._root_span_export,
-        )
-        ctx.task_confirmed = True
-        self._task_order.append(tool_use_id_str)
-    else:
-        # TaskProgressMessage — update existing task span.
-        update: dict[str, Any] = {}
-        metadata = _task_metadata(message)
-        if metadata:
-            update["metadata"] = metadata
-        output = _task_output(message)
-        if output is not None:
-            update["output"] = output
-        if update:
-            ctx.task_span.log(**update)
-
-    if message_type == MessageClassName.TASK_NOTIFICATION:
-        ctx.task_span.end()
-        ctx.task_span = None
-        self._task_order = [k for k in self._task_order if k != tool_use_id_str]
-```
-
-### `ToolSpanTracker` — new methods
-
-These are added alongside the existing `cleanup()`, which stays untouched until
-Step 5 deletes it.
-
-#### `cleanup_context`
-
-Closes tool spans belonging to one subagent context. Called by
-`ContextTracker._handle_assistant` before starting a new LLM span for that
-context. Skips any span whose `tool_use_id` is in `exclude_ids` (live Agent
-spans).
-
-Replaces the mid-stream `cleanup(end_time=..., exclude_tool_use_ids=...,
-only_parent_tool_use_id=...)` call.
-
-```python
-def cleanup_context(
-    self,
-    parent_tool_use_id: str | None,
-    *,
-    end_time: float | None = None,
-    exclude_ids: frozenset[str] = frozenset(),
-) -> None:
-    for tool_use_id in list(self._active_spans):
-        if tool_use_id in exclude_ids:
-            continue
-        if self._active_spans[tool_use_id].parent_tool_use_id != parent_tool_use_id:
-            continue
-        self._end_tool_span(tool_use_id, end_time=end_time)
-```
-
-#### `cleanup_all`
-
-Closes all remaining active spans. Called at end-of-stream by
-`ContextTracker.cleanup()`.
-
-Replaces the no-args `cleanup()` call in `finally:`.
-
-```python
-def cleanup_all(self, end_time: float | None = None) -> None:
-    for tool_use_id in list(self._active_spans):
-        self._end_tool_span(tool_use_id, end_time=end_time)
-```
-
-### Module-level helpers (extracted from `TaskEventSpanTracker`)
-
-```python
-def _task_span_name(message: Any, task_id: str) -> str:
-    return (getattr(message, "description", None)
-            or getattr(message, "task_type", None)
-            or f"Task {task_id}")
-
-def _task_metadata(message: Any) -> dict[str, Any]:
-    return {k: v for k, v in {
-        "task_id":       getattr(message, "task_id", None),
-        "session_id":    getattr(message, "session_id", None),
-        "tool_use_id":   getattr(message, "tool_use_id", None),
-        "task_type":     getattr(message, "task_type", None),
-        "status":        getattr(message, "status", None),
-        "last_tool_name":getattr(message, "last_tool_name", None),
-        "usage":         getattr(message, "usage", None),
-    }.items() if v is not None}
-
-def _task_output(message: Any) -> dict[str, Any] | None:
-    summary = getattr(message, "summary", None)
-    output_file = getattr(message, "output_file", None)
-    if summary is None and output_file is None:
-        return None
-    return {k: v for k, v in {"summary": summary, "output_file": output_file}.items()
-            if v is not None}
-```
-
-### `receive_response` (final form)
-
-```python
-async def receive_response(self) -> AsyncGenerator[Any, None]:
-    generator = self.__client.receive_response()
-    with start_span(
-        name=CLAUDE_AGENT_TASK_SPAN_NAME,
-        span_attributes={"type": SpanTypeAttribute.TASK},
-        input=self.__last_prompt or None,
-    ) as span:
-        input_needs_update = self.__captured_messages is not None
-        tracker = ContextTracker(span, self.__last_prompt, self.__query_start_time)
-        try:
-            async for message in generator:
-                if input_needs_update:
-                    captured = self.__captured_messages or []
-                    if captured:
-                        span.log(input=captured)
-                    input_needs_update = False
-                tracker.add(message)
-                yield message
-        except asyncio.CancelledError:
-            tracker.log_output()
-        else:
-            tracker.log_output()
-        finally:
-            tracker.log_tasks()
-            tracker.cleanup()
-```
-
-### Span parentage
-
-| Span type | Parent |
-|---|---|
-| Root TASK (`"Claude Agent"`) | Ambient caller context |
-| Subagent TASK | Agent tool span → fallback: root TASK |
-| LLM (orchestrator) | Root TASK, or latest active subagent TASK (`_task_order` fallback) |
-| LLM (subagent) | That subagent's TASK span |
-| TOOL | LLM span of the `AssistantMessage` containing the tool call |
-| Nested user span in tool handler | TOOL span (via `set_current()`) |
-
----
-
-## Implementation Order
-
-Each step ends with a green `nox -s "test_claude_agent_sdk(latest)"` run.
-
-### Step 0 ✅ — Remove `_wrap_tool_factory`
-
-Done. Deleted the redundant `tool()` patch from `_wrapper.py` and `__init__.py`.
-
-### Step 1 ✅ — Extract task-event helpers to module-level functions
-
-Done. Added `_task_span_name()`, `_task_metadata()`, `_task_output()` as
-module-level functions. `TaskEventSpanTracker._span_name` / `._metadata` /
-`._output` now delegate to them.
-
-### Step 2 ✅ — Add `cleanup_context` / `cleanup_all` to `ToolSpanTracker`
-
-Done. Added both methods. Existing `cleanup()` left untouched.
-
-### Step 3 ✅ — Migrate mid-stream cleanup call in `receive_response`
-
-Done. Mid-stream call now uses `cleanup_context()`. Old `cleanup()` delegates
-to `cleanup_all()`. Two unit tests updated to use `cleanup_context()` directly.
-
-### Step 4 ✅ — Add `_AgentContext` and `ContextTracker`
-
-Done. Full `ContextTracker` class implemented (dead code — not wired in yet).
-
-### Step 5 ✅ — Wire `ContextTracker` into `receive_response`; delete old classes
-
-Done. Rewrote `receive_response` to use `ContextTracker`. Deleted
-`LLMSpanTracker`, `TaskEventSpanTracker`, `_UNSET_PARENT`. Cleaned up
-`ToolSpanTracker` (removed pending-task-link bookkeeping and old `cleanup()`).
-
-**Implementation note:** `llm_parent_export` was retained on `_AgentContext`
-(contrary to the original plan's §1b which proposed dropping it). Testing
-revealed it's needed when a subagent `AssistantMessage` arrives with
-`parent_tool_use_id=None` right after an orchestrator `AssistantMessage` — the
-parent export changes (root → task span) but `next_llm_start` is still `None`,
-so without the guard the two messages would incorrectly merge.
-
----
-
-All steps complete. The three-tracker architecture (`LLMSpanTracker` +
-`TaskEventSpanTracker` + `ToolSpanTracker`) has been replaced with two
-(`ContextTracker` + `ToolSpanTracker`), with `ContextTracker` owning the
-`ToolSpanTracker` as a private component.
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/SIMPLIFICATION.md b/py/src/braintrust/wrappers/claude_agent_sdk/SIMPLIFICATION.md
deleted file mode 100644
index d0b4d139..00000000
--- a/py/src/braintrust/wrappers/claude_agent_sdk/SIMPLIFICATION.md
+++ /dev/null
@@ -1,366 +0,0 @@
-# Simplification Analysis: Claude Agent SDK Instrumentation
-
-This document analyses the current three-tracker architecture and proposes concrete
-simplifications that reduce the number of trackers, eliminate redundant state, and
-make context routing explicit.
-
----
-
-## 0. The Wrapper Layer
-
-The monkeypatch installs three wrappers, but they serve two completely different jobs:
-
-| Wrapper | Job |
-|---------|-----|
-| `WrappedClaudeSDKClient` | Stream processing — observes every SDK message, creates TASK/LLM/TOOL spans, drives all three trackers |
-| `WrappedSdkMcpTool` / `wrapped_tool_fn` | Handler activation — wraps tool handlers at registration time so they re-enter the pre-created TOOL span when the SDK calls them |
-
-The handler wrappers (`SdkMcpTool` and `tool`) are a bridge between two execution
-contexts: span *creation* happens on the stream side (controlled by Braintrust) and
-span *activation* happens on the handler side (called by the Claude SDK). See
-`INSTRUMENTATION.md § 1b` for the full two-phase handoff diagram.
-
-### 0a. `wrapped_tool_fn` is redundant and can be removed
-
-`claude_agent_sdk.tool()` is not an independent code path. Its entire body is:
-
-```python
-def decorator(handler) -> SdkMcpTool[Any]:
-    return SdkMcpTool(name=name, description=description, input_schema=input_schema, handler=handler, ...)
-return decorator
-```
-
-The `SdkMcpTool` name inside that function is resolved through `tool.__globals__`,
-which is `claude_agent_sdk.__dict__`. Patching `claude_agent_sdk.SdkMcpTool =
-WrappedSdkMcpTool` is therefore sufficient — every `tool()` call already routes
-through `WrappedSdkMcpTool.__init__`, which wraps the handler via
-`_wrap_tool_handler`. No separate `tool` patch is needed.
-
-This holds even for the `from claude_agent_sdk import tool` pre-import case that the
-`sys.modules` sweep was designed to handle: because `tool.__globals__ is
-claude_agent_sdk.__dict__`, the function always looks up `SdkMcpTool` from the
-module it was *defined* in, not from the importing module.
-
-The one real obstacle is that `tool()`'s inner `decorator` function has a
-`-> SdkMcpTool[Any]` return annotation that Python evaluates eagerly. This calls
-`__class_getitem__` on whatever `SdkMcpTool` currently is, which would raise
-`TypeError` on a plain subclass. The `__class_getitem__` override already present on
-`WrappedSdkMcpTool` handles this:
-
-```python
-__class_getitem__ = classmethod(lambda cls, params: cls)
-```
-
-**What can be removed:**
-
-| Location | What to remove |
-|----------|----------------|
-| `_wrapper.py` | `_wrap_tool_factory` function entirely |
-| `__init__.py` | `_wrap_tool_factory` import |
-| `__init__.py` | `original_tool_fn` / `wrapped_tool_fn` block and its `sys.modules` sweep |
-
-`WrappedSdkMcpTool` and its `__class_getitem__` override stay exactly as-is.
-
-The rest of this document focuses on the three tracker objects that live inside
-`receive_response()`.
-
----
-
-## 1. Current Architecture: Three Trackers, Many Interactions
-
-The current implementation uses three distinct tracker objects that collaborate via
-method calls and shared references:
-
-```
-receive_response()
-  │
-  ├── LLMSpanTracker        — per-subagent-context LLM span lifecycle
-  ├── ToolSpanTracker       — live tool spans, dispatch queues, pending-task IDs
-  └── TaskEventSpanTracker  — TASK spans for subagents, needs a ref to ToolSpanTracker
-```
-
-They interact with each other in non-obvious ways:
-
-| Caller | Callee | Why |
-|--------|--------|-----|
-| `TaskEventSpanTracker.__init__` | receives `ToolSpanTracker` | needs `get_span_export()` to set task span parent |
-| `TaskEventSpanTracker.process` | `tool_tracker.mark_task_started()` | removes tool_use_id from `_pending_task_link_tool_use_ids` |
-| `receive_response` | `task_event_span_tracker.active_tool_use_ids` + `tool_tracker.pending_task_link_tool_use_ids` | builds combined exclusion set for cleanup |
-| `receive_response` | `task_event_span_tracker.parent_export_for_message()` | gets LLM span parent before calling `llm_tracker.start_llm_span()` |
-| `receive_response` | `llm_tracker.current_span_export` → passed to `tool_tracker.start_tool_spans()` | chains LLM export to tool parent |
-
-Five cross-tracker interactions in a hot loop. Every time a new subagent feature needs
-a change, the developer has to reason about all three trackers simultaneously.
-
----
-
-## 2. Redundant and Duplicated State
-
-### 2a. Two half-pictures of the same "Agent tool call" lifecycle
-
-`ToolSpanTracker._pending_task_link_tool_use_ids` and
-`TaskEventSpanTracker._task_span_by_tool_use_id` together track the full lifecycle
-of an `Agent` tool call:
-
-```
-State              Stored in                     Description
-──────             ─────────                     ───────────
-Pending            ToolSpanTracker               Agent span created, TaskStarted not yet seen
-Linked             TaskEventSpanTracker           TaskStarted arrived, task_span_by_tool_use_id set
-Ended              (both remove the entry)        TaskNotification arrived
-```
-
-These two dictionaries key on `agent_tool_use_id` and always move in lockstep:
-`pending → linked` happens atomically in `process()` via `mark_task_started()`.
-The consumer in `receive_response` always reads *both*:
-
-```python
-active_subagent_tool_use_ids = (
-    task_event_span_tracker.active_tool_use_ids          # linked
-    | tool_tracker.pending_task_link_tool_use_ids         # pending
-)
-```
-
-This set union reconstructs information that was always a single set of "live agent
-tool calls". Splitting it between two trackers is unnecessary.
-
-### 2b. `LLMSpanTracker` and `TaskEventSpanTracker` share the same routing key
-
-Both trackers key their primary state on `parent_tool_use_id` (the agent tool call
-that spawned a subagent). The connection is direct:
-
-- `LLMSpanTracker._states[parent_tool_use_id]` → a subagent's LLM span state
-- `TaskEventSpanTracker._task_span_by_tool_use_id[parent_tool_use_id]` → a subagent's TASK span
-
-A subagent has exactly one TASK span and a sequence of LLM spans, all keyed by the
-same `parent_tool_use_id`. Keeping them in two different tracker objects means every
-subagent-related operation must touch two places.
-
-### 2c. `_active_context` is an implicit, mutable cursor
-
-`LLMSpanTracker._active_context` is set via `set_context()` before any method that
-should route to a specific subagent. The sentinel `_UNSET_PARENT = object()` then
-distinguishes "use active context" from "use orchestrator (None)".
-
-This makes it easy to introduce bugs where `set_context()` is forgotten or called
-out of order. The `mark_next_llm_start` method has an entire special-case block to
-compensate for `UserMessage`s that arrive with `parent_tool_use_id=None` while the
-active context is set to a subagent:
-
-```python
-def mark_next_llm_start(self, parent_tool_use_id=_UNSET_PARENT):
-    if parent_tool_use_id is None and self._active_context is not None:
-        parent_tool_use_id = _UNSET_PARENT   # fall back to active context
-    self._get_state(parent_tool_use_id).next_start_time = time.time()
-```
-
-This implicit fallback would be unnecessary if context routing were always explicit.
-
-### 2d. `cleanup()` has three orthogonal filter modes in one method
-
-```python
-def cleanup(
-    self,
-    end_time: float | None = None,
-    exclude_tool_use_ids: frozenset[str] | None = None,
-    only_parent_tool_use_id: Any = _UNSET_PARENT,   # sentinel again
-) -> None:
-```
-
-Three call sites, each using a different combination of parameters. This is a sign
-the method is doing three different jobs:
-
-1. **End-of-stream**: called with no filters — close everything.
-2. **Pre-LLM cleanup within a context**: called with `only_parent_tool_use_id` + `exclude_tool_use_ids` — close dangling tool spans scoped to one subagent, but skip live Agent spans.
-3. **Dangling-span cleanup**: called from tests with just `end_time` or no args.
-
-A simpler API would expose these three intents as distinct methods or with clearer
-parameter names that do not require a sentinel object.
-
----
-
-## 3. What Is Genuinely Irreducible
-
-Not all complexity can be removed. The following pieces are load-bearing:
-
-### 3a. Per-subagent-context state
-
-Concurrent subagents interleave on a single message stream. Each subagent needs its
-own LLM span sequence and TASK span. Keying state on `parent_tool_use_id` (or `None`
-for the orchestrator) is the correct abstraction.
-
-### 3b. Dispatch queues in `ToolSpanTracker`
-
-When two subagents call the same tool with identical arguments, the handler receives
-only `(tool_name, args)` — not a `tool_use_id`. The FIFO dispatch queue maps the
-handler invocation order to the span creation order, which matches the Claude SDK's
-own execution order. This is necessary and correct.
-
-### 3c. Thread-local for handler-to-span bridging
-
-Tool handlers are called by the Claude SDK without any Braintrust context. A
-thread-local is the only way to bridge the active stream session to the handler.
-This cannot be removed without changing the SDK's calling convention.
-
-### 3d. `next_start_time` for non-overlapping sequential spans
-
-Stamping the time when a `UserMessage` with tool results arrives, then using that
-stamp as both the end time of the previous LLM span and the start time of the next
-one, is necessary to produce accurate, non-overlapping span timelines. This logic
-must live somewhere.
-
----
-
-## 4. Proposed Simplifications
-
-### 4a. Merge `LLMSpanTracker` and `TaskEventSpanTracker` into `ContextTracker`
-
-Since both trackers key on `parent_tool_use_id`, merge them into a single object
-with one state record per subagent context:
-
-```python
-@dataclasses.dataclass
-class _AgentContext:
-    # LLM state (from LLMSpanTracker._SubagentState)
-    llm_span: Any | None = None
-    llm_span_export: str | None = None
-    llm_parent_export: str | None = None
-    llm_output: list | None = None
-    next_llm_start: float | None = None
-    # Task state (from TaskEventSpanTracker._task_span_by_tool_use_id)
-    task_span: Any | None = None
-    task_id: str | None = None
-
-class ContextTracker:
-    def __init__(self, root_span_export: str, query_start_time: float | None = None):
-        self._root_span_export = root_span_export
-        # parent_tool_use_id (or None for orchestrator) → _AgentContext
-        self._contexts: dict[str | None, _AgentContext] = {
-            None: _AgentContext(next_llm_start=query_start_time)
-        }
-        self._active_key: str | None = None  # still needed as a cursor, see 4b
-        self._task_order: list[str] = []     # for fallback parent resolution
-
-    def set_active(self, parent_tool_use_id: str | None) -> None: ...
-    def start_llm_span(self, message, prompt, history, parent_export) -> ...: ...
-    def mark_next_llm_start(self, parent_tool_use_id: str | None) -> None: ...
-    def process_task_event(self, message) -> None: ...  # replaces TaskEventSpanTracker.process
-    def llm_parent_export_for_message(self, message) -> str: ...
-    def log_usage(self, metrics) -> None: ...
-    def cleanup(self) -> None: ...
-```
-
-**What this removes:**
-- `TaskEventSpanTracker` as a separate class (≈ 100 lines of code).
-- The `ToolSpanTracker` constructor argument `tool_tracker` from `TaskEventSpanTracker`.
-- The `_task_span_by_tool_use_id` dict — it becomes `_contexts[tool_use_id].task_span`.
-- The `_active_task_order` list can stay on `ContextTracker` as `_task_order` for
-  the same fallback-parent purpose.
-
-**The two remaining `ToolSpanTracker` cross-calls** become:
-- `mark_task_started(tool_use_id)` → `ContextTracker.process_task_event` already knows
-  this; `ToolSpanTracker` can expose a simple `unlink_agent_span(tool_use_id)` or the
-  pending-ID set can move into `ContextTracker` entirely (see 4b).
-- `get_span_export(tool_use_id)` → `ContextTracker._contexts[tool_use_id].task_span.export()`
-
-### 4b. Move the "pending Agent spans" set into `ContextTracker`
-
-`ToolSpanTracker._pending_task_link_tool_use_ids` exists solely to tell `cleanup()`
-"don't close this Agent tool span, its TaskStarted hasn't arrived yet". The decision
-of whether an Agent span is pending or linked is owned by the task event lifecycle,
-which will live in `ContextTracker` after 4a. So the set belongs there.
-
-`ContextTracker` would track whether a context has been confirmed by `TaskStarted`
-as a boolean flag on `_AgentContext`:
-
-```python
-@dataclasses.dataclass
-class _AgentContext:
-    ...
-    task_confirmed: bool = False  # True after TaskStarted received
-```
-
-`ToolSpanTracker.cleanup()` would receive the full set of "live agent tool_use_ids"
-(both confirmed and unconfirmed) from `ContextTracker.live_agent_tool_use_ids` —
-a single property, not two properties unioned by the caller.
-
-### 4c. Make context routing explicit, remove the `_UNSET_PARENT` sentinel
-
-The `_UNSET_PARENT = object()` sentinel is a code smell — it is a non-serializable
-runtime object used as a dict key guard. The need for it arises because
-`mark_next_llm_start` has an implicit fallback: "if you passed `None` but there's
-an active subagent, use the active subagent instead."
-
-Replace the implicit fallback with explicit routing at the call site in
-`receive_response`, where the `UserMessage`'s `parent_tool_use_id` is already being
-read:
-
-```python
-# Before (implicit fallback inside LLMSpanTracker):
-llm_tracker.mark_next_llm_start(user_parent)
-
-# After (caller resolves the context before calling):
-resolved_context = user_parent if user_parent is not None else self._active_context
-context_tracker.mark_next_llm_start(resolved_context)
-```
-
-With this change, `_UNSET_PARENT` can be deleted along with the fallback branch
-inside `mark_next_llm_start`. The tracker method signature becomes simply
-`mark_next_llm_start(context_key: str | None)`.
-
-### 4d. Simplify `ToolSpanTracker.cleanup()` into two focused methods
-
-Replace the three-mode method with two explicit ones:
-
-```python
-def cleanup_context(self, parent_tool_use_id: str | None, *, end_time: float | None = None, exclude_ids: frozenset[str] = frozenset()) -> None:
-    """Close all active tool spans belonging to a specific subagent context,
-    optionally skipping Agent spans that are still live."""
-
-def cleanup_all(self, end_time: float | None = None) -> None:
-    """Close all remaining active spans. Called at end-of-stream."""
-```
-
-The three call sites in `receive_response` and tests map cleanly:
-- Pre-LLM cleanup → `cleanup_context(incoming_parent, end_time=..., exclude_ids=live_agent_ids)`
-- End-of-stream → `cleanup_all()`
-- Test helpers → `cleanup_all()` or `cleanup_context(...)`
-
-No sentinel needed; the filter intent is expressed in the method name.
-
----
-
-## 5. Summary of Changes
-
-| Change | Effect |
-|--------|--------|
-| Merge `LLMSpanTracker` + `TaskEventSpanTracker` → `ContextTracker` | −1 tracker class, eliminates constructor coupling, unifies per-subagent state |
-| Move `_pending_task_link_tool_use_ids` into `ContextTracker` | Eliminates two-property union at call site, single source of truth for Agent span liveness |
-| Remove `_UNSET_PARENT` sentinel | Eliminates implicit fallback, makes `receive_response` loop more readable |
-| Split `cleanup()` into `cleanup_context()` + `cleanup_all()` | Clarifies intent at each call site, removes three-mode parameter combination |
-
-**Trackers before:** 3 (`ToolSpanTracker`, `LLMSpanTracker`, `TaskEventSpanTracker`)
-**Trackers after:** 2 (`ToolSpanTracker`, `ContextTracker`)
-
-**Cross-tracker interactions before:** 5 (see §1 table)
-**Cross-tracker interactions after:** 2 (ContextTracker gives ToolSpanTracker the live-agent-id set for cleanup; ToolSpanTracker gives ContextTracker a task span parent export via `get_span_export`)
-
----
-
-## 6. What Does Not Change
-
-- **`WrappedSdkMcpTool`** — the handler-side wrapper is a separate concern (span
-  activation, not span creation) and is entirely unaffected. See
-  `INSTRUMENTATION.md § 1b`. `wrapped_tool_fn` is removed as part of § 0a above.
-- The `_dispatch_queues` FIFO mechanism in `ToolSpanTracker` — still required.
-- The thread-local for handler bridging — still required. The handler wrappers read
-  it to find the active `ToolSpanTracker`; after this refactor they would read it to
-  find the active `ToolSpanTracker` inside `ContextTracker` (or a direct reference
-  to the same object — the public API is unchanged).
-- The `next_llm_start` stamping logic — still required, just moves into `_AgentContext`.
-- The `_active_context` / `set_active()` cursor on `ContextTracker` — still needed
-  because `AssistantMessage` arrives with a `parent_tool_use_id` that sets routing
-  for the rest of that message's processing. The cursor avoids threading it through
-  every call signature inside the message loop.
-- The test surface — all existing unit and integration tests remain valid; only
-  the internal class and method names change.

From edfd764c42e39de5e9a6835e93925298333e14ac Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Thu, 19 Mar 2026 00:33:02 -0400
Subject: [PATCH 11/12] Move async-prompt input capture into ContextTracker

Add captured_messages parameter to ContextTracker.__init__. On the
first add() call, if captured_messages is set it is logged to the root
span and cleared, removing the input_needs_update flag and its
associated logic from receive_response entirely.
---
 .../wrappers/claude_agent_sdk/_wrapper.py     | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index 68da7a1a..a9c9cd16 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -507,10 +507,12 @@ def __init__(
         root_span: Any,
         prompt: Any,
         query_start_time: float | None = None,
+        captured_messages: list[dict[str, Any]] | None = None,
     ) -> None:
         self._root_span = root_span
         self._root_span_export = root_span.export()
         self._prompt = prompt
+        self._captured_messages = captured_messages  # logged to root span on first add()
 
         self._tool_tracker = ToolSpanTracker()
         self._contexts: dict[str | None, _AgentContext] = {None: _AgentContext(next_llm_start=query_start_time)}
@@ -526,6 +528,11 @@ def __init__(
 
     def add(self, message: Any) -> None:
         """Consume one SDK message and update spans accordingly."""
+        if self._captured_messages is not None:
+            if self._captured_messages:
+                self._root_span.log(input=self._captured_messages)
+            self._captured_messages = None
+
         message_type = type(message).__name__
         if message_type == MessageClassName.ASSISTANT:
             self._handle_assistant(message)
@@ -805,30 +812,20 @@ async def receive_response(self) -> AsyncGenerator[Any, None]:
             """Wrap receive_response to add tracing via ContextTracker."""
             generator = self.__client.receive_response()
 
-            # Determine the initial input - may be updated later if using async generator
-            initial_input = self.__last_prompt if self.__last_prompt else None
-
             with start_span(
                 name=CLAUDE_AGENT_TASK_SPAN_NAME,
                 span_attributes={"type": SpanTypeAttribute.TASK},
-                input=initial_input,
+                input=self.__last_prompt or None,
             ) as span:
-                input_needs_update = self.__captured_messages is not None
                 context_tracker = ContextTracker(
                     root_span=span,
                     prompt=self.__last_prompt,
                     query_start_time=self.__query_start_time,
+                    captured_messages=self.__captured_messages,
                 )
 
                 try:
                     async for message in generator:
-                        # One-shot: update root span input from async-generator prompt.
-                        if input_needs_update:
-                            captured = self.__captured_messages or []
-                            if captured:
-                                span.log(input=captured)
-                            input_needs_update = False
-
                         context_tracker.add(message)
                         yield message
                 except asyncio.CancelledError:

From 6bf81a28c470eaf8ad44493e61da35bff99f6c4a Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Thu, 19 Mar 2026 09:35:04 -0400
Subject: [PATCH 12/12] Fix SDK 0.1.10 compatibility; add version-specific
 cassettes

SDK 0.1.10 uses a flat SystemMessage(subtype, data=<full raw payload>)
where task fields like task_id and tool_use_id live in message.data
rather than as top-level attributes. Add _msg_field() helper that reads
from the attribute first, then falls back to message.data, and use it
in all system-message field accesses.

Two cassette-backed tests were recorded with SDK 0.1.48 and do not
replay correctly on 0.1.10 due to the older SDK's limited message
stream (only 3 messages per session). Add _sdk_cassette_name() helper
that selects a version-specific cassette name when running under an
older SDK, and record 0.1.10-specific cassettes for both tests.
On 0.1.10, each test asserts only that the root TASK span exists (the
full subagent span structure requires the richer message stream that
0.1.11+ produces).
---
 .../wrappers/claude_agent_sdk/_wrapper.py     |  44 +++--
 ...ans_with_correct_parenting_sdk_0_1_10.json | 186 ++++++++++++++++++
 ...gent_tool_output_preserved_sdk_0_1_10.json | 186 ++++++++++++++++++
 .../wrappers/claude_agent_sdk/test_wrapper.py |  45 ++++-
 4 files changed, 439 insertions(+), 22 deletions(-)
 create mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting_sdk_0_1_10.json
 create mode 100644 py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_interleaved_subagent_tool_output_preserved_sdk_0_1_10.json

diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
index a9c9cd16..1cefc6d4 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py
@@ -433,29 +433,47 @@ def _activate_tool_span_for_handler(tool_name: Any, args: Any) -> _ActiveToolSpa
     return tool_span_tracker.acquire_span_for_handler(tool_name, args) or _NOOP_ACTIVE_TOOL_SPAN
 
 
+def _msg_field(message: Any, field: str) -> Any:
+    """Read a field from a system message, falling back to message.data for older SDK versions.
+
+    SDK >= 0.1.11 exposes TaskStartedMessage / TaskProgressMessage /
+    TaskNotificationMessage with fields as top-level attributes.
+    SDK 0.1.10 uses a flat SystemMessage(subtype, data=<full raw payload dict>)
+    where task fields live directly in data (e.g. data["task_id"]).
+    """
+    value = getattr(message, field, None)
+    if value is not None:
+        return value
+    # Older SDK: message.data is the full raw payload dict with task fields at its top level.
+    data = getattr(message, "data", None)
+    if isinstance(data, dict):
+        return data.get(field)
+    return None
+
+
 def _task_span_name(message: Any, task_id: str) -> str:
-    return getattr(message, "description", None) or getattr(message, "task_type", None) or f"Task {task_id}"
+    return _msg_field(message, "description") or _msg_field(message, "task_type") or f"Task {task_id}"
 
 
 def _task_metadata(message: Any) -> dict[str, Any]:
     return {
         k: v
         for k, v in {
-            "task_id": getattr(message, "task_id", None),
-            "session_id": getattr(message, "session_id", None),
-            "tool_use_id": getattr(message, "tool_use_id", None),
-            "task_type": getattr(message, "task_type", None),
-            "status": getattr(message, "status", None),
-            "last_tool_name": getattr(message, "last_tool_name", None),
-            "usage": getattr(message, "usage", None),
+            "task_id": _msg_field(message, "task_id"),
+            "session_id": _msg_field(message, "session_id"),
+            "tool_use_id": _msg_field(message, "tool_use_id"),
+            "task_type": _msg_field(message, "task_type"),
+            "status": _msg_field(message, "status"),
+            "last_tool_name": _msg_field(message, "last_tool_name"),
+            "usage": _msg_field(message, "usage"),
         }.items()
         if v is not None
     }
 
 
 def _task_output(message: Any) -> dict[str, Any] | None:
-    summary = getattr(message, "summary", None)
-    output_file = getattr(message, "output_file", None)
+    summary = _msg_field(message, "summary")
+    output_file = _msg_field(message, "output_file")
 
     if summary is None and output_file is None:
         return None
@@ -627,7 +645,7 @@ def _handle_result(self, message: Any) -> None:
             self._root_span.log(metadata=result_metadata)
 
     def _handle_system(self, message: Any) -> None:
-        agent_span_export = self._tool_tracker.get_span_export(getattr(message, "tool_use_id", None))
+        agent_span_export = self._tool_tracker.get_span_export(_msg_field(message, "tool_use_id"))
         self._process_task_event(message, agent_span_export)
         self._task_events.append(_serialize_system_message(message))
 
@@ -727,11 +745,11 @@ def _start_or_merge_llm_span(
 
     def _process_task_event(self, message: Any, agent_span_export: str | None) -> None:
         """Handle TaskStarted / TaskProgress / TaskNotification system messages."""
-        task_id = getattr(message, "task_id", None)
+        task_id = _msg_field(message, "task_id")
         if task_id is None:
             return
         task_id = str(task_id)
-        tool_use_id = getattr(message, "tool_use_id", None)
+        tool_use_id = _msg_field(message, "tool_use_id")
         tool_use_id_str = str(tool_use_id) if tool_use_id is not None else None
         ctx = self._get_context(tool_use_id_str)
         message_type = type(message).__name__
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting_sdk_0_1_10.json b/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting_sdk_0_1_10.json
new file mode 100644
index 00000000..0ed4f710
--- /dev/null
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting_sdk_0_1_10.json
@@ -0,0 +1,186 @@
+{
+  "cassette_name": "test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting_sdk_0_1_10",
+  "events": [
+    {
+      "op": "write",
+      "payload": {
+        "kind": "json",
+        "value": {
+          "request": {
+            "hooks": null,
+            "subtype": "initialize"
+          },
+          "request_id": "req_1_9d03d2d5",
+          "type": "control_request"
+        }
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "response": {
+          "request_id": "req_1_9d03d2d5",
+          "response": {
+            "account": {
+              "apiKeySource": "ANTHROPIC_API_KEY",
+              "tokenSource": "none"
+            },
+            "available_output_styles": [],
+            "commands": [],
+            "models": []
+          },
+          "subtype": "success"
+        },
+        "type": "control_response"
+      }
+    },
+    {
+      "op": "write",
+      "payload": {
+        "kind": "json",
+        "value": {
+          "message": {
+            "content": "Run three tasks.",
+            "role": "user"
+          },
+          "parent_tool_use_id": null,
+          "session_id": "default",
+          "type": "user"
+        }
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "agents": [
+          "general-purpose",
+          "statusline-setup",
+          "Explore",
+          "Plan"
+        ],
+        "apiKeySource": "ANTHROPIC_API_KEY",
+        "claude_code_version": "2.0.53",
+        "cwd": "<REDACTED_CWD>",
+        "mcp_servers": [],
+        "model": "claude-haiku-4-5-20251001",
+        "output_style": "default",
+        "permissionMode": "bypassPermissions",
+        "plugins": [],
+        "session_id": "b604680d-6581-44d7-a3af-a2ed91069472",
+        "skills": [],
+        "slash_commands": [
+          "compact",
+          "context",
+          "cost",
+          "init",
+          "pr-comments",
+          "release-notes",
+          "todos",
+          "review",
+          "security-review"
+        ],
+        "subtype": "init",
+        "tools": [
+          "Task",
+          "Bash",
+          "Glob",
+          "Grep",
+          "ExitPlanMode",
+          "Read",
+          "Edit",
+          "Write",
+          "NotebookEdit",
+          "WebFetch",
+          "TodoWrite",
+          "WebSearch",
+          "BashOutput",
+          "KillShell",
+          "Skill",
+          "SlashCommand",
+          "EnterPlanMode"
+        ],
+        "type": "system",
+        "uuid": "c865727f-7d34-4507-b61f-62a2783275a9"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "I'd be happy to help you run three tasks! However, I need more information about what tasks you'd like me to perform. Could you please specify:\n\n1. **Task 1**: What would you like me to do?\n2. **Task 2**: What would you like me to do?\n3. **Task 3**: What would you like me to do?\n\nFor example, you could ask me to:\n- Search through code files\n- Read or edit specific files\n- Run bash commands\n- Create or modify code\n- Analyze documentation\n- Or anything else you need help with\n\nPlease provide details about each task you'd like me to complete.",
+              "type": "text"
+            }
+          ],
+          "context_management": null,
+          "id": "msg_01V8F4DzGofweXoRuffhD7US",
+          "model": "claude-haiku-4-5-20251001",
+          "role": "assistant",
+          "stop_reason": null,
+          "stop_sequence": null,
+          "type": "message",
+          "usage": {
+            "cache_creation": {
+              "ephemeral_1h_input_tokens": 0,
+              "ephemeral_5m_input_tokens": 0
+            },
+            "cache_creation_input_tokens": 0,
+            "cache_read_input_tokens": 13878,
+            "inference_geo": "not_available",
+            "input_tokens": 3,
+            "output_tokens": 2,
+            "service_tier": "standard"
+          }
+        },
+        "parent_tool_use_id": null,
+        "session_id": "b604680d-6581-44d7-a3af-a2ed91069472",
+        "type": "assistant",
+        "uuid": "9721c458-3826-477e-9d4c-f999c579eb19"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "duration_api_ms": 4085,
+        "duration_ms": 2145,
+        "is_error": false,
+        "modelUsage": {
+          "claude-haiku-4-5-20251001": {
+            "cacheCreationInputTokens": 0,
+            "cacheReadInputTokens": 13878,
+            "contextWindow": 200000,
+            "costUSD": 0.0039508,
+            "inputTokens": 883,
+            "outputTokens": 336,
+            "webSearchRequests": 0
+          }
+        },
+        "num_turns": 1,
+        "permission_denials": [],
+        "result": "I'd be happy to help you run three tasks! However, I need more information about what tasks you'd like me to perform. Could you please specify:\n\n1. **Task 1**: What would you like me to do?\n2. **Task 2**: What would you like me to do?\n3. **Task 3**: What would you like me to do?\n\nFor example, you could ask me to:\n- Search through code files\n- Read or edit specific files\n- Run bash commands\n- Create or modify code\n- Analyze documentation\n- Or anything else you need help with\n\nPlease provide details about each task you'd like me to complete.",
+        "session_id": "b604680d-6581-44d7-a3af-a2ed91069472",
+        "subtype": "success",
+        "total_cost_usd": 0.0039508,
+        "type": "result",
+        "usage": {
+          "cache_creation": {
+            "ephemeral_1h_input_tokens": 0,
+            "ephemeral_5m_input_tokens": 0
+          },
+          "cache_creation_input_tokens": 0,
+          "cache_read_input_tokens": 13878,
+          "input_tokens": 3,
+          "output_tokens": 145,
+          "server_tool_use": {
+            "web_fetch_requests": 0,
+            "web_search_requests": 0
+          },
+          "service_tier": "standard"
+        },
+        "uuid": "a318b707-c0e0-456a-8831-9e17587b89d8"
+      }
+    }
+  ],
+  "sdk_version": "0.1.10"
+}
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_interleaved_subagent_tool_output_preserved_sdk_0_1_10.json b/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_interleaved_subagent_tool_output_preserved_sdk_0_1_10.json
new file mode 100644
index 00000000..dacee961
--- /dev/null
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/cassettes/test_interleaved_subagent_tool_output_preserved_sdk_0_1_10.json
@@ -0,0 +1,186 @@
+{
+  "cassette_name": "test_interleaved_subagent_tool_output_preserved_sdk_0_1_10",
+  "events": [
+    {
+      "op": "write",
+      "payload": {
+        "kind": "json",
+        "value": {
+          "request": {
+            "hooks": null,
+            "subtype": "initialize"
+          },
+          "request_id": "req_1_5588877a",
+          "type": "control_request"
+        }
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "response": {
+          "request_id": "req_1_5588877a",
+          "response": {
+            "account": {
+              "apiKeySource": "ANTHROPIC_API_KEY",
+              "tokenSource": "none"
+            },
+            "available_output_styles": [],
+            "commands": [],
+            "models": []
+          },
+          "subtype": "success"
+        },
+        "type": "control_response"
+      }
+    },
+    {
+      "op": "write",
+      "payload": {
+        "kind": "json",
+        "value": {
+          "message": {
+            "content": "Launch two subagents to process files.",
+            "role": "user"
+          },
+          "parent_tool_use_id": null,
+          "session_id": "default",
+          "type": "user"
+        }
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "agents": [
+          "general-purpose",
+          "statusline-setup",
+          "Explore",
+          "Plan"
+        ],
+        "apiKeySource": "ANTHROPIC_API_KEY",
+        "claude_code_version": "2.0.53",
+        "cwd": "<REDACTED_CWD>",
+        "mcp_servers": [],
+        "model": "claude-haiku-4-5-20251001",
+        "output_style": "default",
+        "permissionMode": "bypassPermissions",
+        "plugins": [],
+        "session_id": "7828a1aa-bb16-40b9-bc86-51e2d517ff64",
+        "skills": [],
+        "slash_commands": [
+          "compact",
+          "context",
+          "cost",
+          "init",
+          "pr-comments",
+          "release-notes",
+          "todos",
+          "review",
+          "security-review"
+        ],
+        "subtype": "init",
+        "tools": [
+          "Task",
+          "Bash",
+          "Glob",
+          "Grep",
+          "ExitPlanMode",
+          "Read",
+          "Edit",
+          "Write",
+          "NotebookEdit",
+          "WebFetch",
+          "TodoWrite",
+          "WebSearch",
+          "BashOutput",
+          "KillShell",
+          "Skill",
+          "SlashCommand",
+          "EnterPlanMode"
+        ],
+        "type": "system",
+        "uuid": "8bbd422a-4cf7-4325-b7eb-5d9b3f1fbeef"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "message": {
+          "content": [
+            {
+              "text": "I'd be happy to help you launch two subagents to process files! However, I need more information about what you'd like them to do.\n\nCould you please clarify:\n\n1. **What type of processing do you need?**\n   - Exploring/searching the codebase?\n   - Analyzing code for specific patterns?\n   - Reading and summarizing file contents?\n   - Something else?\n\n2. **What files or directories should they work with?**\n   - Specific file paths or patterns?\n   - Which directories to focus on?\n\n3. **What should the output be?**\n   - A summary of findings?\n   - Specific information extracted?\n   - Code changes suggested?\n\n4. **Which agent types would be most appropriate?**\n   - `general-purpose` - for complex multi-step tasks\n   - `Explore` - for quickly finding files and understanding code patterns\n   - `Plan` - for exploring and planning implementation\n\nOnce you provide these details, I can launch two subagents in parallel to handle your file processing tasks efficiently!",
+              "type": "text"
+            }
+          ],
+          "context_management": null,
+          "id": "msg_01EmuZbDHBmcwyS4nHu8ABTu",
+          "model": "claude-haiku-4-5-20251001",
+          "role": "assistant",
+          "stop_reason": "end_turn",
+          "stop_sequence": null,
+          "type": "message",
+          "usage": {
+            "cache_creation": {
+              "ephemeral_1h_input_tokens": 0,
+              "ephemeral_5m_input_tokens": 329
+            },
+            "cache_creation_input_tokens": 329,
+            "cache_read_input_tokens": 13554,
+            "inference_geo": "not_available",
+            "input_tokens": 3,
+            "output_tokens": 239,
+            "service_tier": "standard"
+          }
+        },
+        "parent_tool_use_id": null,
+        "session_id": "7828a1aa-bb16-40b9-bc86-51e2d517ff64",
+        "type": "assistant",
+        "uuid": "f882767c-9f15-4a10-9d34-acefa6219dd5"
+      }
+    },
+    {
+      "op": "read",
+      "payload": {
+        "duration_api_ms": 6524,
+        "duration_ms": 4413,
+        "is_error": false,
+        "modelUsage": {
+          "claude-haiku-4-5-20251001": {
+            "cacheCreationInputTokens": 329,
+            "cacheReadInputTokens": 13554,
+            "contextWindow": 200000,
+            "costUSD": 0.00466965,
+            "inputTokens": 938,
+            "outputTokens": 393,
+            "webSearchRequests": 0
+          }
+        },
+        "num_turns": 1,
+        "permission_denials": [],
+        "result": "I'd be happy to help you launch two subagents to process files! However, I need more information about what you'd like them to do.\n\nCould you please clarify:\n\n1. **What type of processing do you need?**\n   - Exploring/searching the codebase?\n   - Analyzing code for specific patterns?\n   - Reading and summarizing file contents?\n   - Something else?\n\n2. **What files or directories should they work with?**\n   - Specific file paths or patterns?\n   - Which directories to focus on?\n\n3. **What should the output be?**\n   - A summary of findings?\n   - Specific information extracted?\n   - Code changes suggested?\n\n4. **Which agent types would be most appropriate?**\n   - `general-purpose` - for complex multi-step tasks\n   - `Explore` - for quickly finding files and understanding code patterns\n   - `Plan` - for exploring and planning implementation\n\nOnce you provide these details, I can launch two subagents in parallel to handle your file processing tasks efficiently!",
+        "session_id": "7828a1aa-bb16-40b9-bc86-51e2d517ff64",
+        "subtype": "success",
+        "total_cost_usd": 0.00466965,
+        "type": "result",
+        "usage": {
+          "cache_creation": {
+            "ephemeral_1h_input_tokens": 0,
+            "ephemeral_5m_input_tokens": 329
+          },
+          "cache_creation_input_tokens": 329,
+          "cache_read_input_tokens": 13554,
+          "input_tokens": 3,
+          "output_tokens": 239,
+          "server_tool_use": {
+            "web_fetch_requests": 0,
+            "web_search_requests": 0
+          },
+          "service_tier": "standard"
+        },
+        "uuid": "2bb0e757-b81f-4aa6-a904-13ae1e3bd1a4"
+      }
+    }
+  ],
+  "sdk_version": "0.1.10"
+}
diff --git a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
index 52dcdeb4..44cb8426 100644
--- a/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
+++ b/py/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py
@@ -220,6 +220,14 @@ def _assert_llm_spans_have_time_to_first_token(llm_spans: list[dict[str, Any]])
         assert llm_span["metrics"]["time_to_first_token"] >= 0
 
 
+def _sdk_cassette_name(base: str, *, min_version: str) -> str:
+    """Return base cassette name for SDK >= min_version, else a version-specific variant."""
+    if _sdk_version_at_least(min_version):
+        return base
+    sdk_ver = getattr(claude_agent_sdk, "__version__", "0").replace(".", "_")
+    return f"{base}_sdk_{sdk_ver}"
+
+
 def _sdk_version_at_least(version: str) -> bool:
     if not CLAUDE_SDK_AVAILABLE:
         return False
@@ -1905,7 +1913,10 @@ async def test_concurrent_subagents_produce_parallel_llm_spans_with_correct_pare
             permission_mode="bypassPermissions",
         )
         transport = make_cassette_transport(
-            cassette_name="test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting",
+            cassette_name=_sdk_cassette_name(
+                "test_concurrent_subagents_produce_parallel_llm_spans_with_correct_parenting",
+                min_version="0.1.11",
+            ),
             prompt="",
             options=options,
         )
@@ -1923,15 +1934,22 @@ async def test_concurrent_subagents_produce_parallel_llm_spans_with_correct_pare
 
     all_tools = round1_tools + round2_tools
 
-    # --- 1. All subagent TASK spans exist ---
+    # --- 1. Root TASK span exists ---
     _find_span_by_name(task_spans, "Claude Agent")
+
+    if not _sdk_version_at_least("0.1.11"):
+        # SDK 0.1.10 replays a limited cassette (single assistant + result);
+        # only assert the root task span was produced.
+        return
+
+    # --- 2. All subagent TASK spans exist ---
     subagent_task_by_label: dict[str, dict[str, Any]] = {}
     for sa in subagents:
         subagent_task_by_label[sa["label"]] = _find_span_by_name(task_spans, f"Task {sa['label']}")
 
     task_id_by_span = {t["span_id"]: label for label, t in subagent_task_by_label.items()}
 
-    # --- 2. Every tool span has output ---
+    # --- 3. Every tool span has output ---
     non_agent_tools = [s for s in tool_spans if s["span_attributes"]["name"] != "Agent"]
     tools_without_output = [s for s in non_agent_tools if s.get("output") is None]
     assert not tools_without_output, (
@@ -1939,7 +1957,7 @@ async def test_concurrent_subagents_produce_parallel_llm_spans_with_correct_pare
         f"Missing: {[s['span_attributes']['name'] + '(' + s.get('metadata', {}).get('gen_ai.tool.call.id', '?') + ')' for s in tools_without_output]}"
     )
 
-    # --- 3. Tool spans are parented to the correct subagent's LLM span ---
+    # --- 4. Tool spans are parented to the correct subagent's LLM span ---
     agent_id_to_label = {sa["agent_id"]: sa["label"] for sa in subagents}
     tool_id_to_label = {t["id"]: agent_id_to_label[t["agent_id"]] for t in all_tools}
 
@@ -1958,7 +1976,7 @@ async def test_concurrent_subagents_produce_parallel_llm_spans_with_correct_pare
             f"Tool {tool_call_id} should be under subagent {expected_label}, got {actual_label}"
         )
 
-    # --- 4. Correct tool output content ---
+    # --- 5. Correct tool output content ---
     for t in all_tools:
         span = next(s for s in tool_spans if s.get("metadata", {}).get("gen_ai.tool.call.id") == t["id"])
         assert span["output"]["content"] == t["result"]
@@ -1968,12 +1986,12 @@ async def test_concurrent_subagents_produce_parallel_llm_spans_with_correct_pare
     assert mcp_span["span_attributes"]["name"] == "remote_tool"
     assert mcp_span["metadata"].get("mcp.server") == "server"
 
-    # --- 5. Scale check ---
+    # --- 6. Scale check ---
     assert len(non_agent_tools) == 6
     assert len(llm_spans) >= 7
     assert len(task_spans) == 4
 
-    # --- 6. LLM spans from different subagents overlap (not serialized) ---
+    # --- 7. LLM spans from different subagents overlap (not serialized) ---
     subagent_llm_spans: dict[str, list[dict[str, Any]]] = {sa["label"]: [] for sa in subagents}
     for llm_span in llm_spans:
         label = task_id_by_span.get(llm_span["span_parents"][0])
@@ -1990,7 +2008,7 @@ async def test_concurrent_subagents_produce_parallel_llm_spans_with_correct_pare
         f"A end={a_first['metrics']['end']}, B start={b_first['metrics']['start']}"
     )
 
-    # --- 7. Tool spans fit within their parent LLM span ---
+    # --- 8. Tool spans fit within their parent LLM span ---
     for tool in non_agent_tools:
         parent_llm = next((s for s in llm_spans if s["span_id"] == tool["span_parents"][0]), None)
         if parent_llm and "end" in parent_llm.get("metrics", {}):
@@ -2073,7 +2091,10 @@ async def test_interleaved_subagent_tool_spans_parent_to_correct_llm(memory_logg
             permission_mode="bypassPermissions",
         )
         transport = make_cassette_transport(
-            cassette_name="test_interleaved_subagent_tool_output_preserved",
+            cassette_name=_sdk_cassette_name(
+                "test_interleaved_subagent_tool_output_preserved",
+                min_version="0.1.11",
+            ),
             prompt="",
             options=options,
         )
@@ -2089,6 +2110,12 @@ async def test_interleaved_subagent_tool_spans_parent_to_correct_llm(memory_logg
     tool_spans = _find_spans_by_type(spans, SpanTypeAttribute.TOOL)
     task_spans = _find_spans_by_type(spans, SpanTypeAttribute.TASK)
 
+    _find_span_by_name(task_spans, "Claude Agent")
+
+    if not _sdk_version_at_least("0.1.11"):
+        # SDK 0.1.10 replays a limited cassette; only assert root task span.
+        return
+
     alpha_task = _find_span_by_name(task_spans, "Process alpha file")
     beta_task = _find_span_by_name(task_spans, "Process beta file")