diff --git a/src/utils/agent_tools.py b/src/utils/agent_tools.py index 0d95cf6d0..e3544568c 100644 --- a/src/utils/agent_tools.py +++ b/src/utils/agent_tools.py @@ -1,7 +1,7 @@ import asyncio import logging import weakref -from collections.abc import Callable +from collections.abc import Callable, Sequence from dataclasses import dataclass from datetime import datetime from typing import Any, cast @@ -40,6 +40,21 @@ def _normalized_observation_input( return obs.model_copy(update={"content": obs.content.strip()}) +def _dedupe_observation_docs_for_tool_output( + documents: Sequence[Document], +) -> list[Document]: + """Remove exact duplicate observations from tool output without mutating storage.""" + seen: set[tuple[str, str]] = set() + deduped: list[Document] = [] + for doc in documents: + key = (doc.level or "explicit", " ".join(doc.content.casefold().split())) + if key in seen: + continue + seen.add(key) + deduped.append(doc) + return deduped + + def _base_observation_properties() -> dict[str, Any]: return { "content": { @@ -1957,6 +1972,7 @@ async def _handle_get_reasoning_chain( db, ctx.workspace_name, doc.source_ids ) if premises: + premises = _dedupe_observation_docs_for_tool_output(premises) premise_lines: list[Any] = [] for p in premises: p_level = p.level or "explicit" @@ -1976,6 +1992,7 @@ async def _handle_get_reasoning_chain( db, ctx.workspace_name, doc.source_ids ) if sources: + sources = _dedupe_observation_docs_for_tool_output(sources) source_lines: list[Any] = [] for s in sources: s_level = s.level or "explicit" @@ -2004,6 +2021,7 @@ async def _handle_get_reasoning_chain( observed=ctx.observed, ) if children: + children = _dedupe_observation_docs_for_tool_output(children) child_lines: list[Any] = [] for c in children: c_level = c.level or "explicit" diff --git a/src/utils/representation.py b/src/utils/representation.py index 674b25bb4..7c4cf6aa5 100644 --- a/src/utils/representation.py +++ b/src/utils/representation.py @@ -15,6 +15,32 @@ def _strip_microseconds_and_timezone(timestamp: datetime) -> datetime: return timestamp.replace(microsecond=0, tzinfo=None) +def _normalize_observation_content(content: str) -> str: + """Normalize observation content for retrieval-time exact deduplication.""" + return " ".join(content.casefold().split()) + + +def _dedupe_documents_for_representation( + documents: Sequence[models.Document], +) -> list[models.Document]: + """ + Remove exact duplicate observations before formatting LLM-facing context. + + This is intentionally retrieval-time only: it does not mutate the database, + vector store, or source/premise links. Keep the first document in the input + order so semantic/recent/derived ranking remains stable. + """ + seen: set[tuple[str, str]] = set() + deduped: list[models.Document] = [] + for doc in documents: + key = (doc.level or "explicit", _normalize_observation_content(doc.content)) + if key in seen: + continue + seen.add(key) + deduped.append(doc) + return deduped + + def flatten_message_ids( message_ids: list[int] | list[list[int]] | list[tuple[int, int]], ) -> list[int]: @@ -581,6 +607,7 @@ def format_as_markdown(self, include_ids: bool = False) -> str: @classmethod def from_documents(cls, documents: Sequence[models.Document]) -> "Representation": + documents = _dedupe_documents_for_representation(documents) return cls( explicit=[ ExplicitObservation( diff --git a/tests/utils/test_representation_retrieval_dedupe.py b/tests/utils/test_representation_retrieval_dedupe.py new file mode 100644 index 000000000..67a2d4439 --- /dev/null +++ b/tests/utils/test_representation_retrieval_dedupe.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import datetime as dt +from types import SimpleNamespace + +from src.utils.representation import Representation + + +def _doc(doc_id: str, level: str, content: str): + return SimpleNamespace( + id=doc_id, + level=level, + content=content, + created_at=dt.datetime(2026, 1, 1, 12, 0, tzinfo=dt.UTC), + internal_metadata={"message_ids": [1]}, + session_name="s1", + source_ids=None, + ) + + +def test_representation_from_documents_dedupes_exact_normalized_content(): + representation = Representation.from_documents( + [ + _doc("doc-a", "explicit", "User prefers concise responses."), + _doc("doc-b", "explicit", " user prefers CONCISE responses. "), + _doc("doc-c", "deductive", "User prefers concise responses."), + ] + ) + + assert [obs.id for obs in representation.explicit] == ["doc-a"] + # Same normalized content in a different observation level is retained. + assert [obs.id for obs in representation.deductive] == ["doc-c"]