Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion src/utils/agent_tools.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import asyncio
import logging
import weakref
from collections.abc import Callable
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from datetime import datetime
from typing import Any, cast
Expand Down Expand Up @@ -40,6 +40,21 @@ def _normalized_observation_input(
return obs.model_copy(update={"content": obs.content.strip()})


def _dedupe_observation_docs_for_tool_output(
documents: Sequence[Document],
) -> list[Document]:
"""Remove exact duplicate observations from tool output without mutating storage."""
seen: set[tuple[str, str]] = set()
deduped: list[Document] = []
for doc in documents:
key = (doc.level or "explicit", " ".join(doc.content.casefold().split()))
if key in seen:
continue
seen.add(key)
deduped.append(doc)
return deduped


def _base_observation_properties() -> dict[str, Any]:
return {
"content": {
Expand Down Expand Up @@ -1957,6 +1972,7 @@ async def _handle_get_reasoning_chain(
db, ctx.workspace_name, doc.source_ids
)
if premises:
premises = _dedupe_observation_docs_for_tool_output(premises)
premise_lines: list[Any] = []
for p in premises:
p_level = p.level or "explicit"
Expand All @@ -1976,6 +1992,7 @@ async def _handle_get_reasoning_chain(
db, ctx.workspace_name, doc.source_ids
)
if sources:
sources = _dedupe_observation_docs_for_tool_output(sources)
source_lines: list[Any] = []
for s in sources:
s_level = s.level or "explicit"
Expand Down Expand Up @@ -2004,6 +2021,7 @@ async def _handle_get_reasoning_chain(
observed=ctx.observed,
)
if children:
children = _dedupe_observation_docs_for_tool_output(children)
child_lines: list[Any] = []
for c in children:
c_level = c.level or "explicit"
Expand Down
27 changes: 27 additions & 0 deletions src/utils/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,32 @@ def _strip_microseconds_and_timezone(timestamp: datetime) -> datetime:
return timestamp.replace(microsecond=0, tzinfo=None)


def _normalize_observation_content(content: str) -> str:
"""Normalize observation content for retrieval-time exact deduplication."""
return " ".join(content.casefold().split())


def _dedupe_documents_for_representation(
documents: Sequence[models.Document],
) -> list[models.Document]:
"""
Remove exact duplicate observations before formatting LLM-facing context.

This is intentionally retrieval-time only: it does not mutate the database,
vector store, or source/premise links. Keep the first document in the input
order so semantic/recent/derived ranking remains stable.
"""
seen: set[tuple[str, str]] = set()
deduped: list[models.Document] = []
for doc in documents:
key = (doc.level or "explicit", _normalize_observation_content(doc.content))
if key in seen:
continue
seen.add(key)
deduped.append(doc)
return deduped


def flatten_message_ids(
message_ids: list[int] | list[list[int]] | list[tuple[int, int]],
) -> list[int]:
Expand Down Expand Up @@ -581,6 +607,7 @@ def format_as_markdown(self, include_ids: bool = False) -> str:

@classmethod
def from_documents(cls, documents: Sequence[models.Document]) -> "Representation":
documents = _dedupe_documents_for_representation(documents)
return cls(
explicit=[
ExplicitObservation(
Expand Down
32 changes: 32 additions & 0 deletions tests/utils/test_representation_retrieval_dedupe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from __future__ import annotations

import datetime as dt
from types import SimpleNamespace

from src.utils.representation import Representation


def _doc(doc_id: str, level: str, content: str):
return SimpleNamespace(
id=doc_id,
level=level,
content=content,
created_at=dt.datetime(2026, 1, 1, 12, 0, tzinfo=dt.UTC),
internal_metadata={"message_ids": [1]},
session_name="s1",
source_ids=None,
)


def test_representation_from_documents_dedupes_exact_normalized_content():
representation = Representation.from_documents(
[
_doc("doc-a", "explicit", "User prefers concise responses."),
_doc("doc-b", "explicit", " user prefers CONCISE responses. "),
_doc("doc-c", "deductive", "User prefers concise responses."),
]
)

assert [obs.id for obs in representation.explicit] == ["doc-a"]
# Same normalized content in a different observation level is retained.
assert [obs.id for obs in representation.deductive] == ["doc-c"]