databricks · dhruv0811 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/integrations/langchain/src/databricks_langchain/checkpoint.py b/integrations/langchain/src/databricks_langchain/checkpoint.py
@@ -1,8 +1,13 @@
 from __future__ import annotations
 
-from typing import Any
+import copy
+import logging
+from typing import Any, Sequence
 
 from databricks.sdk import WorkspaceClient
+from databricks_ai_bridge.tool_repair import DEFAULT_SYNTHETIC_INTERRUPTED_OUTPUT
+
+logger = logging.getLogger(__name__)
 
 try:
     from databricks_ai_bridge.lakebase import AsyncLakebasePool, LakebaseClient, LakebasePool
@@ -16,6 +21,109 @@
 
     _checkpoint_imports_available = False
 
+# langchain_core is treated as optional at import time. When it is missing,
+# the message classes fall back to ``object`` placeholders so the names stay
+# defined, and ``_message_imports_available`` records the failure so the
+# repair helpers in this module can short-circuit.
+try:
+    from langchain_core.messages import AIMessage, ToolMessage
+
+    _message_imports_available = True
+except ImportError:
+    AIMessage = object  # type: ignore
+    ToolMessage = object  # type: ignore
+    _message_imports_available = False
+
+
+def _build_tool_resume_repair(messages: Sequence[Any]) -> list[Any]:
+    """Build synthetic ``ToolMessage`` responses for orphan tool calls.
+
+    Internal helper used by ``_repair_loaded_checkpoint_tuple``. When a
+    LangGraph run is killed mid-tool, the checkpoint keeps the trailing
+    ``AIMessage.tool_calls`` but the paired ``ToolMessage`` s never land.
+    Replaying that state to the LLM fails because the provider API
+    (Anthropic in particular) requires every ``tool_use`` to be followed
+    by a matching ``tool_result``.
+
+    Only the trailing assistant turn is inspected: the contiguous run of
+    ``AIMessage`` / ``ToolMessage`` instances at the end of ``messages``.
+    Orphans earlier in the history are not looked at.
+
+    Args:
+        messages: The ``messages`` channel history, oldest first.
+
+    Returns:
+        One synthetic ``ToolMessage`` (content
+        ``DEFAULT_SYNTHETIC_INTERRUPTED_OUTPUT``) per tool-call id in the
+        trailing turn that has no ``ToolMessage`` with the same
+        ``tool_call_id``, ordered by first appearance of the id. Empty when
+        nothing needs repair, when ``messages`` is empty, or when
+        ``langchain_core`` could not be imported. The input is not mutated;
+        the caller is responsible for appending the result.
+    """
+    if not _message_imports_available or not messages:
+        return []
+
+    # Measure the trailing assistant turn by scanning from the end until the
+    # first message that is neither an assistant nor a tool message.
+    trailing_len = 0
+    for msg in reversed(messages):
+        if not isinstance(msg, (AIMessage, ToolMessage)):
+            break
+        trailing_len += 1
+    # Slice from an explicit start index: ``messages[-0:]`` would be the
+    # whole sequence rather than an empty one.
+    trailing = messages[len(messages) - trailing_len :]
+
+    # Dict keys give insertion-ordered de-duplication with O(1) membership.
+    requested: dict[str, None] = {}
+    answered: set[str] = set()
+    for msg in trailing:
+        if isinstance(msg, AIMessage):
+            for tc in getattr(msg, "tool_calls", None) or []:
+                # Tool calls may be plain dicts or attribute-style objects.
+                tc_id = tc.get("id") if isinstance(tc, dict) else getattr(tc, "id", None)
+                if tc_id:
+                    requested.setdefault(tc_id, None)
+        elif isinstance(msg, ToolMessage):
+            tcid = getattr(msg, "tool_call_id", None)
+            if tcid:
+                answered.add(tcid)
+
+    return [
+        ToolMessage(tool_call_id=tc_id, content=DEFAULT_SYNTHETIC_INTERRUPTED_OUTPUT)
+        for tc_id in requested
+        if tc_id not in answered
+    ]
+
+
+def _repair_loaded_checkpoint_tuple(tup: Any) -> Any:
+    """Return ``tup`` with orphan tool calls in its ``messages`` channel
+    closed by synthetic ``ToolMessage`` s.
+
+    Applied to the result of every ``(a)get_tuple`` so the served
+    checkpoint has each trailing tool call paired with a tool result.
+
+    ``tup`` is returned unchanged (same object) when it is ``None``, when
+    ``langchain_core`` is unavailable, when ``checkpoint`` /
+    ``channel_values`` are not dicts, when ``messages`` is not a non-empty
+    list, or when ``_build_tool_resume_repair`` finds nothing to add. That
+    last case makes the function idempotent: a repaired history produces
+    no further synthetic messages.
+
+    Otherwise a new tuple is built with ``tup._replace``; the checkpoint
+    dict, the ``channel_values`` dict and the ``messages`` list are
+    shallow-copied, so the objects held by ``tup`` are not mutated.
+
+    NOTE(review): the original author expects the synthetic messages to be
+    persisted when LangGraph next writes state, so the stored row heals
+    over time; that is not verifiable from this module. Also confirm that
+    appending tool results here does not interfere with resuming a
+    checkpoint whose pending node is the tools node (for example a
+    human-in-the-loop interrupt).
+    """
+    if not _message_imports_available or tup is None:
+        return tup
+
+    # Drill down defensively: any unexpected shape means "leave it alone".
+    ckpt = getattr(tup, "checkpoint", None)
+    values = ckpt.get("channel_values") if isinstance(ckpt, dict) else None
+    history = values.get("messages") if isinstance(values, dict) else None
+    if not (isinstance(history, list) and history):
+        return tup
+
+    synthetic = _build_tool_resume_repair(history)
+    if not synthetic:
+        return tup
+
+    logger.info(
+        "[durable] checkpoint read-time repair: injected %d synthetic ToolMessage(s)",
+        len(synthetic),
+    )
+    # Copy-on-write at every level touched, so the loaded tuple stays intact.
+    patched_values = dict(values)
+    patched_values["messages"] = [*history, *synthetic]
+    patched_ckpt = copy.copy(ckpt)
+    patched_ckpt["channel_values"] = patched_values
+    return tup._replace(checkpoint=patched_ckpt)
+
 
 class CheckpointSaver(PostgresSaver):
     """
@@ -68,6 +176,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self._lakebase.close()
         return False
 
+    def get_tuple(self, config):
+        """Load the checkpoint tuple from the parent saver and return it after
+        ``_repair_loaded_checkpoint_tuple`` has paired any trailing orphan
+        tool calls with synthetic tool results."""
+        loaded = super().get_tuple(config)
+        return _repair_loaded_checkpoint_tuple(loaded)
+
 
 class AsyncCheckpointSaver(AsyncPostgresSaver):
     """
@@ -122,3 +234,7 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
         """Exit async context manager and close the connection pool."""
         await self._lakebase.close()
         return False
+
+    async def aget_tuple(self, config):
+        """Await the parent saver's checkpoint tuple and return it after
+        ``_repair_loaded_checkpoint_tuple`` has paired any trailing orphan
+        tool calls with synthetic tool results."""
+        loaded = await super().aget_tuple(config)
+        return _repair_loaded_checkpoint_tuple(loaded)
diff --git a/integrations/langchain/tests/unit_tests/test_checkpoint.py b/integrations/langchain/tests/unit_tests/test_checkpoint.py
@@ -446,3 +446,82 @@ async def test_async_checkpoint_saver_branch_resource_path(monkeypatch):
 
     assert "host=auto-db-host" in test_pool.conninfo
     assert saver._lakebase._is_autoscaling is True
+
+
+class TestReadTimeCheckpointRepair:
+    """Read-time repair: aget_tuple / get_tuple returns a state where every
+    trailing ``AIMessage.tool_calls`` is paired with a ``ToolMessage``. Keeps
+    user-space free of middleware when the app is built on our savers."""
+
+    def _make_tuple(self, messages):
+        # Minimal stand-in for a checkpoint tuple: a namedtuple with the
+        # fields the repair reads, which also provides ``_replace``.
+        # NOTE: ``messages`` is copied, so the list stored in the tuple is a
+        # different object from the caller's list.
+        from collections import namedtuple
+
+        FakeTuple = namedtuple(
+            "CheckpointTuple",
+            ["config", "checkpoint", "metadata", "parent_config", "pending_writes"],
+        )
+        return FakeTuple(
+            config={},
+            checkpoint={
+                "v": 1,
+                "id": "ckpt",
+                "channel_values": {"messages": list(messages)},
+            },
+            metadata={},
+            parent_config=None,
+            pending_writes=None,
+        )
+
+    def test_repairs_trailing_orphan_tool_call(self):
+        from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+
+        from databricks_langchain.checkpoint import _repair_loaded_checkpoint_tuple
+
+        tup = self._make_tuple(
+            [
+                HumanMessage("hi"),
+                AIMessage(content="", tool_calls=[{"id": "c1", "name": "f", "args": {}}]),
+            ]
+        )
+        repaired = _repair_loaded_checkpoint_tuple(tup)
+        msgs = repaired.checkpoint["channel_values"]["messages"]
+        assert len(msgs) == 3
+        assert isinstance(msgs[-1], ToolMessage)
+        assert msgs[-1].tool_call_id == "c1"
+
+    def test_noop_when_state_is_clean(self):
+        from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+
+        from databricks_langchain.checkpoint import _repair_loaded_checkpoint_tuple
+
+        tup = self._make_tuple(
+            [
+                HumanMessage("hi"),
+                AIMessage(content="", tool_calls=[{"id": "c1", "name": "f", "args": {}}]),
+                ToolMessage(tool_call_id="c1", content="ok"),
+                AIMessage(content="done"),
+            ]
+        )
+        repaired = _repair_loaded_checkpoint_tuple(tup)
+        # No repair added → tuple unchanged.
+        assert repaired is tup
+
+    def test_none_tuple_passes_through(self):
+        from databricks_langchain.checkpoint import _repair_loaded_checkpoint_tuple
+
+        assert _repair_loaded_checkpoint_tuple(None) is None
+
+    def test_does_not_mutate_original_messages_list(self):
+        from langchain_core.messages import AIMessage, HumanMessage
+
+        from databricks_langchain.checkpoint import _repair_loaded_checkpoint_tuple
+
+        original_messages = [
+            HumanMessage("hi"),
+            AIMessage(content="", tool_calls=[{"id": "c1", "name": "f", "args": {}}]),
+        ]
+        tup = self._make_tuple(original_messages)
+        original_len = len(original_messages)
+        # ``_make_tuple`` copies its input, so the objects that could actually
+        # be mutated are the ones stored inside the tuple. Capture those.
+        stored_checkpoint = tup.checkpoint
+        stored_values = stored_checkpoint["channel_values"]
+        stored_messages = stored_values["messages"]
+
+        repaired = _repair_loaded_checkpoint_tuple(tup)
+
+        # Calling repair must NOT mutate the caller's original list.
+        assert len(original_messages) == original_len
+        # Nor may it mutate the list / dicts held by the loaded tuple.
+        assert tup.checkpoint is stored_checkpoint
+        assert stored_checkpoint["channel_values"] is stored_values
+        assert stored_values["messages"] is stored_messages
+        assert len(stored_messages) == original_len
+        # The repair must land on fresh copies instead.
+        assert repaired is not tup
+        assert repaired.checkpoint is not stored_checkpoint
+        assert repaired.checkpoint["channel_values"]["messages"] is not stored_messages
+        assert len(repaired.checkpoint["channel_values"]["messages"]) == original_len + 1
diff --git a/integrations/openai/src/databricks_openai/agents/session.py b/integrations/openai/src/databricks_openai/agents/session.py
@@ -43,17 +43,27 @@ async def main():
         DEFAULT_TOKEN_CACHE_DURATION_SECONDS,
         AsyncLakebaseSQLAlchemy,
     )
+    from databricks_ai_bridge.tool_repair import sanitize_tool_items
 
     _session_imports_available = True
 except ImportError:
     SQLAlchemySession = object  # type: ignore
     DEFAULT_TOKEN_CACHE_DURATION_SECONDS = None  # type: ignore
     DEFAULT_POOL_RECYCLE_SECONDS = None  # type: ignore
+    sanitize_tool_items = None  # type: ignore
     _session_imports_available = False
 
 logger = logging.getLogger(__name__)
 
 
+def _sanitize_items(items: list[Any]) -> list[Any]:
+    """Run :func:`sanitize_tool_items` on ``items`` with the session log prefix.
+
+    Module-level wrapper whose only job is to fix ``log_prefix`` so every
+    caller in this module logs consistently; the argument and the return
+    value are passed straight through.
+
+    Raises:
+        ImportError: if the optional session dependencies failed to import,
+            in which case ``sanitize_tool_items`` is the ``None`` placeholder
+            and calling it would otherwise fail with an opaque ``TypeError``.
+    """
+    if sanitize_tool_items is None:
+        raise ImportError(
+            "sanitize_tool_items is unavailable because the optional session "
+            "dependencies failed to import; session items cannot be sanitized."
+        )
+    return sanitize_tool_items(items, log_prefix="[durable] session items sanitized")
+
+
 class AsyncDatabricksSession(SQLAlchemySession):
     """
     Async OpenAI Agents SDK Session implementation for Databricks Lakebase.
@@ -179,6 +189,18 @@ async def _ensure_tables(self) -> None:
             await self._lakebase.create_schema()
         await super()._ensure_tables()
 
+    async def get_items(self, limit: Optional[int] = None) -> list[Any]:
+        """Fetch session items from the parent session and sanitize them.
+
+        The parent's ``get_items`` result is passed through
+        ``_sanitize_items`` before being returned. Per the accompanying
+        tests, sanitizing is expected to append a synthetic
+        ``function_call_output`` for each orphan ``function_call``, drop
+        outputs with no matching call, and de-duplicate repeats.
+
+        ``limit`` is forwarded to the parent and applied before sanitizing,
+        so the returned list may be longer or shorter than ``limit``.
+
+        NOTE(review): this method performs no writes itself; whether
+        ``sanitize_tool_items`` is purely in-memory should be confirmed
+        against its implementation.
+        """
+        return _sanitize_items(await super().get_items(limit=limit))
+
     @classmethod
     def _build_cache_key(
         cls,

diff --git a/integrations/openai/tests/unit_tests/test_session.py b/integrations/openai/tests/unit_tests/test_session.py
@@ -1289,6 +1289,116 @@ def test_init_branch_resource_path_resolves_host(
             )
 
 
+class TestSanitizeItems:
+    """Unit tests for ``_sanitize_items``, the pure walker that reconciles
+    orphan ``function_call`` / ``function_call_output`` items. It backs the
+    read-time ``get_items()`` filter (and, per the original note, a
+    destructive ``repair()`` path that is not exercised here)."""
+
+    def _items_for(self, *types_and_ids):
+        # Helper: a bare string becomes a user message; a (type, call_id)
+        # tuple becomes a function_call or function_call_output item.
+        built = []
+        for spec in types_and_ids:
+            if isinstance(spec, str):
+                built.append({"role": "user", "content": spec})
+                continue
+            item_type, call_id = spec
+            if item_type == "function_call":
+                entry = {"type": item_type, "call_id": call_id, "name": "f", "arguments": "{}"}
+            else:
+                entry = {"type": item_type, "call_id": call_id, "output": "ok"}
+            built.append(entry)
+        return built
+
+    def test_noop_when_clean_returns_same_list(self):
+        from databricks_openai.agents.session import _sanitize_items
+
+        clean = self._items_for(
+            "hi",
+            ("function_call", "c1"),
+            ("function_call_output", "c1"),
+            "done",
+        )
+        # Identity, not equality: caller can skip re-persistence.
+        assert _sanitize_items(clean) is clean
+
+    def test_injects_synthetic_output_for_orphan_call(self):
+        from databricks_openai.agents.session import _sanitize_items
+
+        result = _sanitize_items(self._items_for("hi", ("function_call", "c1")))
+        assert len(result) == 3
+        last = result[-1]
+        assert last["type"] == "function_call_output"
+        assert last["call_id"] == "c1"
+
+    def test_injects_for_multiple_orphan_calls(self):
+        # Scenario the user hit: multiple parallel tool_calls, all orphaned.
+        from databricks_openai.agents.session import _sanitize_items
+
+        result = _sanitize_items(
+            self._items_for(
+                "hi",
+                ("function_call", "c1"),
+                ("function_call", "c2"),
+                ("function_call", "c3"),
+            )
+        )
+        call_items = [item for item in result if item.get("type") == "function_call"]
+        output_items = [item for item in result if item.get("type") == "function_call_output"]
+        assert len(call_items) == 3
+        assert len(output_items) == 3
+        assert {item["call_id"] for item in output_items} == {"c1", "c2", "c3"}
+
+    def test_drops_orphan_output_with_no_matching_call(self):
+        from databricks_openai.agents.session import _sanitize_items
+
+        result = _sanitize_items(self._items_for("hi", ("function_call_output", "ghost")))
+        assert not any(item.get("type") == "function_call_output" for item in result)
+
+    def test_dedupes_duplicate_calls_and_outputs(self):
+        from databricks_openai.agents.session import _sanitize_items
+
+        doubled = self._items_for(
+            ("function_call", "c1"),
+            ("function_call", "c1"),
+            ("function_call_output", "c1"),
+            ("function_call_output", "c1"),
+        )
+        # One call and one output should survive.
+        assert len(_sanitize_items(doubled)) == 2
+
+
+class TestAsyncGetItemsAutoRepair:
+    """get_items() always applies read-time repair. Builds a bare
+    ``AsyncDatabricksSession`` (no ``__init__``, no DB) and stubs the parent
+    ``SQLAlchemySession.get_items`` so the production override itself runs."""
+
+    def _fake_session(self, monkeypatch, items):
+        from databricks_openai.agents.session import (
+            AsyncDatabricksSession,
+            SQLAlchemySession,
+        )
+
+        async def _stored_items(self, limit=None):
+            # Stand-in for the DB read performed by the parent session.
+            return list(items)
+
+        # Patch the parent, NOT the subclass: overriding get_items on a fake
+        # subclass would bypass the very method under test.
+        monkeypatch.setattr(SQLAlchemySession, "get_items", _stored_items)
+        # Bypass __init__ — the override only needs super().get_items().
+        return AsyncDatabricksSession.__new__(AsyncDatabricksSession)
+
+    @pytest.mark.asyncio
+    async def test_auto_repair_injects_synthetic_outputs(self, monkeypatch):
+        sess = self._fake_session(
+            monkeypatch,
+            [
+                {"role": "user", "content": "hi"},
+                {"type": "function_call", "call_id": "c1", "name": "f", "arguments": "{}"},
+                {"type": "function_call", "call_id": "c2", "name": "f", "arguments": "{}"},
+            ],
+        )
+        items = await sess.get_items()
+        synth = [i for i in items if i.get("type") == "function_call_output"]
+        assert len(synth) == 2
+        assert {i["call_id"] for i in synth} == {"c1", "c2"}
+
+
 # =============================================================================
 # Schema Tests
 # =============================================================================