diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..d923427
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,91 @@
+name: Release
+
+# Triggered when a version tag is pushed, e.g.:
+# git tag v0.7.5.0 && git push origin v0.7.5.0
+on:
+ push:
+ tags:
+ - "v*"
+
+permissions:
+ contents: read
+
+jobs:
+ # 1. Build the sdist + wheel and verify the tag matches pyproject version.
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v3
+ with:
+ version: "latest"
+
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.13"
+
+ - name: Verify tag matches pyproject version
+ run: |
+ TAG="${GITHUB_REF_NAME#v}"
+ PKG=$(uv version --short 2>/dev/null || python -c "import tomllib;print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
+ echo "Tag version: $TAG"
+ echo "Package version: $PKG"
+ if [ "$TAG" != "$PKG" ]; then
+ echo "::error::Tag ($TAG) does not match pyproject.toml version ($PKG)."
+ exit 1
+ fi
+
+ - name: Build sdist and wheel
+ run: uv build
+
+ - name: Check distribution metadata
+ run: uvx twine check dist/*
+
+ - name: Upload build artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ name: dist
+ path: dist/
+
+ # 2. Publish to PyPI via Trusted Publishing (OIDC, no API token needed).
+ pypi:
+ needs: build
+ runs-on: ubuntu-latest
+ environment:
+ name: pypi
+ url: https://pypi.org/project/10xscale-agentflow/
+ permissions:
+ id-token: write # required for trusted publishing
+ steps:
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: dist
+ path: dist/
+
+ - name: Publish to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+
+ # 3. Create the GitHub Release with auto-generated notes and attach artifacts.
+ github-release:
+ needs: pypi
+ runs-on: ubuntu-latest
+ permissions:
+ contents: write # required to create a release
+ steps:
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: dist
+ path: dist/
+
+ - name: Create GitHub Release
+ uses: softprops/action-gh-release@v2
+ with:
+ generate_release_notes: true
+ files: dist/*
+ fail_on_unmatched_files: true
diff --git a/agentflow/core/graph/utils/invoke_handler.py b/agentflow/core/graph/utils/invoke_handler.py
index 47645e3..309e251 100644
--- a/agentflow/core/graph/utils/invoke_handler.py
+++ b/agentflow/core/graph/utils/invoke_handler.py
@@ -11,6 +11,7 @@
from agentflow.core.graph.edge import Edge
from agentflow.core.graph.node import Node
from agentflow.core.graph.utils.utils import (
+ calculate_token_usage,
call_realtime_sync,
get_next_node,
load_or_create_state,
@@ -479,8 +480,12 @@ async def invoke(
final_state, messages = await self._execute_graph(state, config)
logger.info("Graph execution completed with %d final messages", len(messages))
+ # Calculate token usage
+ token_usage = calculate_token_usage(messages)
+
event.event_type = EventType.END
event.metadata["status"] = "Graph execution completed"
+ event.metadata.update(token_usage)
event.data["state"] = final_state.model_dump()
event.data["messages"] = [m.model_dump() for m in messages] if messages else []
publish_event(event)
@@ -489,6 +494,7 @@ async def invoke(
final_state,
messages,
response_granularity,
+ token_usage=token_usage,
)
except Exception as e:
logger.exception("Graph execution failed: %s", e)
diff --git a/agentflow/core/graph/utils/stream_handler.py b/agentflow/core/graph/utils/stream_handler.py
index bdb5710..ee8cafc 100644
--- a/agentflow/core/graph/utils/stream_handler.py
+++ b/agentflow/core/graph/utils/stream_handler.py
@@ -39,6 +39,7 @@
InterruptConfigMixin,
)
from .utils import (
+ calculate_token_usage,
call_realtime_sync,
get_next_node,
load_or_create_state,
@@ -589,6 +590,7 @@ async def _execute_graph( # noqa: PLR0912, PLR0915
event.metadata["is_context_trimmed"] = is_context_trimmed
publish_event(event)
+ # Include messages list for token calculation in stream method
yield StreamChunk(
event=StreamEvent.UPDATES,
state=state,
@@ -599,6 +601,8 @@ async def _execute_graph( # noqa: PLR0912, PLR0915
"max_steps": max_steps,
"is_context_trimmed": is_context_trimmed,
"reason": "Graph execution completed successfully",
+ # Internal: messages from current run for token calculation
+ "_messages": messages,
},
thread_id=config.get("thread_id"),
run_id=config.get("run_id"),
@@ -747,8 +751,19 @@ async def stream(
logger.debug("Beginning graph execution")
result = self._execute_graph(state, input_data, config)
+ # Track messages from current run for token calculation
+ current_run_messages = []
+
# Stream results based on response granularity
async for chunk in result:
+ # Extract messages from final completion chunk (internal use only)
+ if (
+ chunk.event == StreamEvent.UPDATES
+ and chunk.data
+ and chunk.data.get("status") == "graph_invoked"
+ ):
+ current_run_messages = chunk.data.pop("_messages", [])
+
match response_granularity:
case ResponseGranularity.FULL:
yield chunk
@@ -763,6 +778,9 @@ async def stream(
time_taken = time.time() - start_time
logger.info("Graph execution finished in %.2f seconds", time_taken)
+ # Calculate token usage from current run messages only
+ token_usage = calculate_token_usage(current_run_messages)
+
event.event_type = EventType.END
event.metadata.update(
{
@@ -772,19 +790,22 @@ async def stream(
"current_node": state.execution_meta.current_node,
"is_interrupted": state.is_interrupted(),
"total_messages": len(state.context) if state.context else 0,
+ **token_usage,
}
)
publish_event(event)
- yield StreamChunk(
- event=StreamEvent.UPDATES,
- state=state,
- data={
- "status": "graph_invoked",
- "reason": "Graph execution finished",
- "time_taken": time_taken,
- "is_interrupted": state.is_interrupted(),
- "total_messages": len(state.context) if state.context else 0,
- },
- thread_id=config.get("thread_id"),
- run_id=config.get("run_id"),
- )
+ if response_granularity == ResponseGranularity.FULL:
+ yield StreamChunk(
+ event=StreamEvent.UPDATES,
+ state=state,
+ data={
+ "status": "graph_invoked",
+ "reason": "Graph execution finished",
+ "time_taken": time_taken,
+ "is_interrupted": state.is_interrupted(),
+ "total_messages": len(state.context) if state.context else 0,
+ **token_usage,
+ },
+ thread_id=config.get("thread_id"),
+ run_id=config.get("run_id"),
+ )
diff --git a/agentflow/core/graph/utils/utils.py b/agentflow/core/graph/utils/utils.py
index 6060310..b808471 100644
--- a/agentflow/core/graph/utils/utils.py
+++ b/agentflow/core/graph/utils/utils.py
@@ -50,6 +50,7 @@ async def parse_response(
state: AgentState,
messages: list[Message],
response_granularity: ResponseGranularity = ResponseGranularity.LOW,
+ token_usage: dict[str, int] | None = None,
) -> dict[str, Any]:
"""Parse and format execution response based on specified granularity level.
@@ -83,19 +84,20 @@ async def parse_response(
match response_granularity:
case ResponseGranularity.FULL:
# Return full state and messages
- return {"state": state, "messages": messages}
+ return {"state": state, "messages": messages, "token_usage": token_usage}
case ResponseGranularity.PARTIAL:
# Return state and summary of messages
return {
"context": state.context,
"summary": state.context_summary,
- "message": messages,
+ "messages": messages,
+ "token_usage": token_usage,
}
case ResponseGranularity.LOW:
# Return all messages from state context
- return {"messages": messages}
+ return {"messages": messages, "token_usage": token_usage}
- return {"messages": messages}
+ return {"messages": messages, "token_usage": token_usage}
# Utility to update only provided fields in state
@@ -577,3 +579,60 @@ async def sync_data(
await checkpointer.aput_messages(config, messages)
return is_context_trimmed
+
+
+def calculate_token_usage(messages: list[Message]) -> dict[str, int]:
+ """Calculate total token usage from all messages in the state.
+
+ Aggregates token usage across all messages in the state's context,
+ including input tokens (prompt_tokens), output tokens (completion_tokens),
+ and reasoning tokens.
+
+ Args:
+ messages: The list of messages containing token usage information.
+
+ Returns:
+ Dictionary containing:
+ - total_input_tokens: Total prompt/input tokens used
+ - total_output_tokens: Total completion/output tokens used
+ - total_reasoning_tokens: Total reasoning tokens used
+ - total_tokens: Sum of input and output tokens
+
+ Example:
+ ```python
+ usage = calculate_token_usage(state)
+ # Returns: {
+ # "total_input_tokens": 1500,
+ # "total_output_tokens": 800,
+ # "total_reasoning_tokens": 200,
+ # "total_tokens": 2300
+ # }
+ ```
+ """
+ total_input_tokens = 0
+ total_output_tokens = 0
+ total_reasoning_tokens = 0
+
+ if not messages:
+ return {
+ "total_input_tokens": total_input_tokens,
+ "total_output_tokens": total_output_tokens,
+ "total_reasoning_tokens": total_reasoning_tokens,
+ "total_tokens": total_input_tokens + total_output_tokens,
+ }
+
+ for message in messages:
+ if message.usages:
+ total_input_tokens += message.usages.prompt_tokens
+ total_output_tokens += message.usages.completion_tokens
+ total_reasoning_tokens += message.usages.reasoning_tokens
+
+ # Note: total_tokens is input + output only (reasoning tracked separately)
+ total_tokens = total_input_tokens + total_output_tokens
+
+ return {
+ "total_input_tokens": total_input_tokens,
+ "total_output_tokens": total_output_tokens,
+ "total_reasoning_tokens": total_reasoning_tokens,
+ "total_tokens": total_tokens,
+ }
diff --git a/agentflow/runtime/protocols/__init__.py b/agentflow/runtime/protocols/__init__.py
index c7914bb..11fec79 100644
--- a/agentflow/runtime/protocols/__init__.py
+++ b/agentflow/runtime/protocols/__init__.py
@@ -4,23 +4,22 @@
``agentflow.runtime.protocols.a2a``.
"""
-from . import a2a
-from .a2a import (
- AgentFlowExecutor,
- build_a2a_app,
- create_a2a_client_node,
- create_a2a_server,
- delegate_to_a2a_agent,
- make_agent_card,
-)
+# from . import a2a
+# from .a2a import (
+# AgentFlowExecutor,
+# build_a2a_app,
+# create_a2a_client_node,
+# create_a2a_server,
+# delegate_to_a2a_agent,
+# make_agent_card,
+# )
-
-__all__ = [
- "AgentFlowExecutor",
- "a2a",
- "build_a2a_app",
- "create_a2a_client_node",
- "create_a2a_server",
- "delegate_to_a2a_agent",
- "make_agent_card",
-]
+# __all__ = [
+# "AgentFlowExecutor",
+# "a2a",
+# "build_a2a_app",
+# "create_a2a_client_node",
+# "create_a2a_server",
+# "delegate_to_a2a_agent",
+# "make_agent_card",
+# ]
diff --git a/agentflow/runtime/protocols/a2a/README.md b/agentflow/runtime/protocols/a2a/README.md
deleted file mode 100644
index e5db421..0000000
--- a/agentflow/runtime/protocols/a2a/README.md
+++ /dev/null
@@ -1,445 +0,0 @@
-# agentflow.a2a_integration
-
-> **Official bridge between agentflow and the [A2A (Agent-to-Agent) protocol](https://github.com/google/A2A) via the `a2a-sdk`.**
-
-This package lets you **expose** any agentflow `CompiledGraph` as an A2A-compliant agent, and **call** remote A2A agents from within agentflow graphs — all with minimal boilerplate.
-
----
-
-## Table of Contents
-
-- [Installation](#installation)
-- [Architecture Overview](#architecture-overview)
-- [Module Reference](#module-reference)
- - [executor.py — AgentFlowExecutor](#executorpy--agentflowexecutor)
- - [server.py — Server Helpers](#serverpy--server-helpers)
- - [client.py — Client Helpers](#clientpy--client-helpers)
-- [Quick Start](#quick-start)
- - [Serving a Graph as an A2A Agent](#serving-a-graph-as-an-a2a-agent)
- - [Calling a Remote A2A Agent](#calling-a-remote-a2a-agent)
- - [Using a Remote Agent as a Graph Node](#using-a-remote-agent-as-a-graph-node)
-- [Conversation Memory (context_id)](#conversation-memory-context_id)
-- [Streaming](#streaming)
-- [Custom Executors](#custom-executors)
-- [Examples](#examples)
-- [API Summary](#api-summary)
-
----
-
-## Installation
-
-The A2A integration requires the `a2a-sdk` extra:
-
-```bash
-pip install agentflow[a2a_sdk]
-```
-
-This installs `a2a-sdk`, `httpx`, `uvicorn`, and all transitive dependencies.
-
----
-
-## Architecture Overview
-
-```
-┌───────────────────────────────────────────────────────────────┐
-│ A2A Client │
-│ (any A2A-compliant client, browser, or another agent) │
-└──────────────┬────────────────────────────────────────────────┘
- │ JSON-RPC / SSE over HTTP
- ▼
-┌──────────────────────────────────────────────────────────────┐
-│ a2a-sdk layer (transport, JSON-RPC, task lifecycle, SSE) │
-│ ┌────────────────────────────────────────────────────────┐ │
-│ │ DefaultRequestHandler + InMemoryTaskStore │ │
-│ └────────────────────┬───────────────────────────────────┘ │
-│ │ calls execute() │
-│ ┌────────────────────▼───────────────────────────────────┐ │
-│ │ AgentFlowExecutor (executor.py) │ │
-│ │ ─ extracts user text from A2A message parts │ │
-│ │ ─ resolves thread_id from context_id │ │
-│ │ ─ runs CompiledGraph.ainvoke() or .astream() │ │
-│ │ ─ pushes results back via TaskUpdater │ │
-│ └────────────────────┬───────────────────────────────────┘ │
-└───────────────────────┼──────────────────────────────────────┘
- │
- ┌────────▼────────┐
- │ CompiledGraph │ ← your agentflow graph
- │ (any topology) │
- └─────────────────┘
-```
-
-**Key insight:** The `a2a-sdk` owns all transport (HTTP, JSON-RPC, SSE, task state machines). Agentflow owns all agent logic (LLM calls, tool execution, state management, checkpointing). `AgentFlowExecutor` is the sole bridge connecting the two.
-
----
-
-## Module Reference
-
-### `executor.py` — AgentFlowExecutor
-
-The core bridge class. Implements the `a2a-sdk`'s `AgentExecutor` interface.
-
-```python
-class AgentFlowExecutor(AgentExecutor):
- def __init__(
- self,
- compiled_graph: CompiledGraph,
- config: dict[str, Any] | None = None,
- streaming: bool = False,
- ) -> None: ...
-```
-
-| Parameter | Type | Description |
-|-----------|------|-------------|
-| `compiled_graph` | `CompiledGraph` | A fully compiled agentflow graph |
-| `config` | `dict` | Base config forwarded to `ainvoke`/`astream` (e.g. `recursion_limit`) |
-| `streaming` | `bool` | When `True`, uses `astream` + sends `TaskState.working` progress events |
-
-**What it does on each A2A request:**
-
-1. Creates a `TaskUpdater` and marks the task as `submitted` → `working`
-2. Extracts user text from the A2A message parts via `context.get_user_input()`
-3. Wraps the text as an agentflow `Message` list
-4. Resolves `thread_id` from `context.context_id` (falling back to `context.task_id`) for checkpointer-based conversation memory
-5. Calls `graph.ainvoke()` (blocking) or `graph.astream()` (streaming)
-6. Extracts the last assistant message from the result state
-7. Pushes it back as an A2A artifact via `updater.add_artifact()` and marks `completed`
-
-**Text extraction** walks backwards through `state.context` looking for the last `role="assistant"` message — this works regardless of how many nodes the graph has.
-
-**Error handling** catches all exceptions and pushes a `TaskState.failed` status with the error message.
-
----
-
-### `server.py` — Server Helpers
-
-Three convenience functions to expose a graph as an A2A HTTP server:
-
-#### `make_agent_card()`
-
-Builds an `AgentCard` (the A2A discovery descriptor served at `/.well-known/agent-card.json`).
-
-```python
-def make_agent_card(
- name: str,
- description: str,
- url: str,
- *,
- skills: list[AgentSkill] | None = None,
- streaming: bool = False,
- version: str = "1.0.0",
-) -> AgentCard: ...
-```
-
-If no `skills` are provided, a default `"run_graph"` skill is created automatically.
-
-#### `build_a2a_app()`
-
-Returns a **Starlette ASGI app** — useful when you need to compose the A2A endpoint with other routes (e.g. mount inside FastAPI) or run it in tests.
-
-```python
-def build_a2a_app(
- compiled_graph: CompiledGraph,
- agent_card: AgentCard,
- *,
- streaming: bool = False,
- executor_config: dict[str, Any] | None = None,
-) -> Starlette: ...
-```
-
-#### `create_a2a_server()`
-
-**One-call blocking server** — builds the app and starts `uvicorn`. Ideal for standalone agents.
-
-```python
-def create_a2a_server(
- compiled_graph: CompiledGraph,
- agent_card: AgentCard,
- *,
- host: str = "0.0.0.0",
- port: int = 9999,
- streaming: bool = False,
- executor_config: dict[str, Any] | None = None,
-) -> None: ...
-```
-
----
-
-### `client.py` — Client Helpers
-
-Utilities for calling **remote** A2A agents from within agentflow graphs.
-
-#### `delegate_to_a2a_agent()`
-
-Async one-shot helper — send text, get text back.
-
-```python
-async def delegate_to_a2a_agent(
- url: str,
- text: str,
- *,
- timeout: float = 30.0,
-) -> str: ...
-```
-
-Sends a single `TextPart` message to the remote agent and returns the response text. Raises `RuntimeError` if the agent returns an error or no text content.
-
-#### `create_a2a_client_node()`
-
-Factory that returns an **agentflow graph node function** wrapping a remote A2A agent. The returned callable has the standard node signature `(state, config) -> list[Message]`.
-
-```python
-def create_a2a_client_node(
- url: str,
- *,
- timeout: float = 30.0,
- response_role: str = "assistant",
-) -> Callable: ...
-```
-
-**Usage in a graph:**
-
-```python
-from agentflow.a2a_integration import create_a2a_client_node
-
-graph.add_node("remote_agent", create_a2a_client_node("http://localhost:9999"))
-graph.add_edge("some_node", "remote_agent")
-graph.add_edge("remote_agent", END)
-```
-
-The node reads the last message from `state.context`, forwards its text to the remote A2A agent, and returns the response as a new `Message`.
-
----
-
-## Quick Start
-
-### Serving a Graph as an A2A Agent
-
-```python
-from agentflow.graph import StateGraph
-from agentflow.state import AgentState
-from agentflow.utils.constants import END
-from agentflow.a2a_integration import (
- create_a2a_server,
- make_agent_card,
-)
-
-# 1. Build your agentflow graph
-async def my_node(state: AgentState, config: dict):
- from agentflow.state import Message
- user_text = state.context[-1].text() if state.context else ""
- return [Message.text_message(f"You said: {user_text}", role="assistant")]
-
-graph = StateGraph[AgentState](AgentState())
-graph.add_node("main", my_node)
-graph.set_entry_point("main")
-graph.add_edge("main", END)
-compiled = graph.compile()
-
-# 2. Create the agent card
-card = make_agent_card(
- name="EchoAgent",
- description="Echoes back whatever you say",
- url="http://localhost:9999",
-)
-
-# 3. Start the A2A server (blocking)
-create_a2a_server(compiled, card, port=9999)
-```
-
-The agent is now discoverable at `http://localhost:9999/.well-known/agent-card.json` and accepts JSON-RPC requests at `http://localhost:9999/`.
-
-### Calling a Remote A2A Agent
-
-```python
-from agentflow.a2a_integration import delegate_to_a2a_agent
-
-response = await delegate_to_a2a_agent(
- "http://localhost:9999",
- "Hello, agent!",
-)
-print(response) # "You said: Hello, agent!"
-```
-
-### Using a Remote Agent as a Graph Node
-
-```python
-from agentflow.a2a_integration import create_a2a_client_node
-from agentflow.graph import StateGraph
-from agentflow.state import AgentState
-from agentflow.utils.constants import END
-
-graph = StateGraph[AgentState](AgentState())
-graph.add_node("user_input", some_input_node)
-graph.add_node("remote", create_a2a_client_node("http://localhost:9999"))
-graph.set_entry_point("user_input")
-graph.add_edge("user_input", "remote")
-graph.add_edge("remote", END)
-
-compiled = graph.compile()
-result = await compiled.ainvoke({"messages": [...]})
-```
-
----
-
-## Conversation Memory (context_id)
-
-The A2A protocol has two key identifiers:
-
-| Field | Purpose |
-|-------|---------|
-| `task_id` | Unique per message/request — changes every call |
-| `context_id` | Stable per conversation session — stays the same across turns |
-
-`AgentFlowExecutor` uses `context_id` as the agentflow checkpointer's `thread_id`. This means:
-
-- **Same `context_id`** → same checkpointer thread → conversation history is restored
-- **Different `context_id`** → fresh thread → new conversation
-- **No `context_id`** → falls back to `task_id` → one-shot (no memory)
-
-```
-Turn 1: context_id="abc" → thread_id="abc" → checkpointer saves state
-Turn 2: context_id="abc" → thread_id="abc" → checkpointer restores state ✓
-Turn 3: context_id="xyz" → thread_id="xyz" → fresh conversation
-```
-
-The client is responsible for sending a consistent `context_id` across turns. The `a2a-sdk` `ClientFactory` handles this automatically when configured.
-
-### Sub-agent context_id isolation
-
-When an agent delegates to another agent (e.g. PlannerAgent → CurrencyAgent), **do not forward the caller's `context_id` directly**. Both agents would share the same checkpointer thread, causing their conversation histories to collide.
-
-Instead, derive a namespaced `context_id` for each sub-agent:
-
-```python
-# Inside a delegation tool
-planner_ctx_id = state.a2a_context_id if state else ""
-currency_ctx_id = f"{planner_ctx_id}:currency" if planner_ctx_id else ""
-result = await _send_to_currency_agent(query, context_id=currency_ctx_id)
-```
-
-This ensures:
-- PlannerAgent checkpoints under `context_id` (e.g. `"abc"`)
-- CurrencyAgent checkpoints under `"abc:currency"`
-- Both maintain independent, stable conversation memory across turns
-- The currency result is still returned to the planner's state as a tool message
-
----
-
-## Streaming
-
-When `streaming=True`, the executor uses `CompiledGraph.astream()` instead of `ainvoke()`:
-
-```python
-# Server side
-executor = AgentFlowExecutor(compiled, streaming=True)
-
-# Or via the server helper
-create_a2a_server(compiled, card, streaming=True)
-```
-
-**What happens during streaming:**
-
-1. The graph yields `StreamChunk` objects as each node completes
-2. For each chunk with a message, the executor pushes a `TaskState.working` status update via SSE
-3. After the stream completes, the final text is emitted as an artifact with `TaskState.completed`
-
-The client observes real-time progress via Server-Sent Events (SSE).
-
-**Note:** This is A2A-level streaming — one SSE `TaskState.working` event is sent per `StreamEvent.MESSAGE` chunk emitted by the graph (i.e. whenever a node produces a message). Token-level LLM streaming is a separate concern from the A2A transport layer.
-
----
-
-## Custom Executors
-
-For advanced use cases (e.g. detecting `input_required`, merging custom state fields), subclass `AgentFlowExecutor`:
-
-```python
-from agentflow.a2a_integration.executor import AgentFlowExecutor
-from a2a.server.agent_execution.context import RequestContext
-from a2a.server.events.event_queue import EventQueue
-from a2a.server.tasks.task_updater import TaskUpdater
-from a2a.types import TaskState, TextPart
-
-class MyCustomExecutor(AgentFlowExecutor):
- async def execute(self, context: RequestContext, event_queue: EventQueue) -> None:
- updater = TaskUpdater(
- event_queue=event_queue,
- task_id=context.task_id or "",
- context_id=context.context_id or "",
- )
- await updater.submit()
- await updater.start_work()
-
- user_text = context.get_user_input() if context.message else ""
- messages = [AFMessage.text_message(user_text, role="user")]
-
- run_config = {
- "thread_id": context.context_id or context.task_id or "",
- }
-
- result = await self.graph.ainvoke(
- {
- "messages": messages,
- # Merge custom fields into your AgentState subclass
- "state": {"my_custom_field": "some_value"},
- },
- config=run_config,
- )
-
- response_text = self._extract_response_text(result)
-
- # Custom logic: detect if agent needs user input
- if self._needs_user_input(result):
- msg = updater.new_agent_message(parts=[TextPart(text=response_text)])
- await updater.update_status(TaskState.input_required, message=msg)
- else:
- await updater.add_artifact([TextPart(text=response_text)])
- await updater.complete()
-```
-
-**Common customisation patterns:**
-
-| Pattern | How |
-|---------|-----|
-| **`input_required` relay** | Inspect `result["state"]` for tool/status markers, set `TaskState.input_required` |
-| **Custom state fields** | Subclass `AgentState` and pass `"state": {field: value}` in the `ainvoke` input dict to merge extra fields (e.g. `PlannerState.a2a_context_id`) into the graph state before the first node runs |
-| **Multi-agent delegation** | Use `delegate_to_a2a_agent()` or a custom tool inside a `ToolNode`; derive a namespaced `context_id` per sub-agent (e.g. `f"{ctx_id}:currency"`) to keep each agent's checkpointer thread isolated |
-| **Task metadata** | Access `context.task_id`, `context.context_id`, `context.message` for routing decisions |
-
----
-
-## Examples
-
-Working examples are in `examples/a2a_sdk/`:
-
-| Example | Description | Key Concepts |
-|---------|-------------|--------------|
-| **`currency/`** | Single ReAct agent with currency conversion tool | `AgentFlowExecutor`, `make_agent_card`, `create_a2a_server` |
-| **`pattern1_human_agent/`** | Human-in-the-loop with `input_required` | Custom executor, `TaskState.input_required`, conversation memory |
-| **`pattern2_orchestrator/`** | Central orchestrator routing to specialist agents | Multi-agent coordination, `delegate_to_a2a_agent` |
-| **`pattern3_smart_client/`** | Client-side routing to multiple agents | Client-driven orchestration, per-agent `context_id` |
-| **`pattern4_planner_delegates/`** | Planner ReAct graph delegates currency queries to CurrencyAgent via a `ToolNode` tool; detects `input_required` and relays it across agent boundaries | Custom `PlannerState` carrying `a2a_context_id`, namespaced sub-agent `context_id` (`f"{ctx}:currency"`), `_tool_requested_input` inspection of state, `TaskState.input_required` relay |
-
----
-
-## API Summary
-
-### Public Exports (`from agentflow.a2a_integration import ...`)
-
-| Name | Type | Description |
-|------|------|-------------|
-| `AgentFlowExecutor` | Class | Bridges `CompiledGraph` into A2A's `AgentExecutor` interface |
-| `make_agent_card` | Function | Builds an `AgentCard` with sensible defaults |
-| `build_a2a_app` | Function | Returns a Starlette ASGI app (composable) |
-| `create_a2a_server` | Function | One-call blocking server (uvicorn) |
-| `delegate_to_a2a_agent` | Function | Async one-shot: send text to remote agent, get text back |
-| `create_a2a_client_node` | Function | Factory returning a graph node that wraps a remote A2A agent |
-
-### Dependencies
-
-| Package | Purpose |
-|---------|---------|
-| `a2a-sdk` | A2A protocol implementation (transport, JSON-RPC, SSE, task lifecycle) |
-| `httpx` | Async HTTP client for `delegate_to_a2a_agent` |
-| `uvicorn` | ASGI server for `create_a2a_server` |
-| `starlette` | ASGI framework (via `a2a-sdk`) |
diff --git a/agentflow/runtime/protocols/a2a/__init__.py b/agentflow/runtime/protocols/a2a/__init__.py
index 22697c2..33c2b27 100644
--- a/agentflow/runtime/protocols/a2a/__init__.py
+++ b/agentflow/runtime/protocols/a2a/__init__.py
@@ -1,36 +1,36 @@
-"""Optional A2A protocol bridge for Agentflow.
+# """Optional A2A protocol bridge for Agentflow.
-This package exposes any agentflow ``CompiledGraph`` as a standard A2A
-agent using the official ``a2a-sdk`` package, and also provides client
-helpers to call remote A2A agents from within a graph.
+# This package exposes any agentflow ``CompiledGraph`` as a standard A2A
+# agent using the official ``a2a-sdk`` package, and also provides client
+# helpers to call remote A2A agents from within a graph.
-Install the extra:
+# Install the extra:
- pip install 10xscale-agentflow[a2a_sdk]
+# pip install 10xscale-agentflow[a2a_sdk]
-Quick start - server:
+# Quick start - server:
- from agentflow.runtime.protocols.a2a import (
- AgentFlowExecutor,
- create_a2a_server,
- make_agent_card,
- )
+# from agentflow.runtime.protocols.a2a import (
+# AgentFlowExecutor,
+# create_a2a_server,
+# make_agent_card,
+# )
-Quick start - client:
+# Quick start - client:
- from agentflow.runtime.protocols.a2a import delegate_to_a2a_agent
-"""
+# from agentflow.runtime.protocols.a2a import delegate_to_a2a_agent
+# """
-from .client import create_a2a_client_node, delegate_to_a2a_agent
-from .executor import AgentFlowExecutor
-from .server import build_a2a_app, create_a2a_server, make_agent_card
+# from .client import create_a2a_client_node, delegate_to_a2a_agent
+# from .executor import AgentFlowExecutor
+# from .server import build_a2a_app, create_a2a_server, make_agent_card
-__all__ = [
- "AgentFlowExecutor",
- "build_a2a_app",
- "create_a2a_client_node",
- "create_a2a_server",
- "delegate_to_a2a_agent",
- "make_agent_card",
-]
+# __all__ = [
+# "AgentFlowExecutor",
+# "build_a2a_app",
+# "create_a2a_client_node",
+# "create_a2a_server",
+# "delegate_to_a2a_agent",
+# "make_agent_card",
+# ]
diff --git a/agentflow/runtime/protocols/a2a/_optional.py b/agentflow/runtime/protocols/a2a/_optional.py
index b624117..4012fc6 100644
--- a/agentflow/runtime/protocols/a2a/_optional.py
+++ b/agentflow/runtime/protocols/a2a/_optional.py
@@ -1,36 +1,36 @@
-"""Helpers for loading A2A optional dependencies."""
+# """Helpers for loading A2A optional dependencies."""
-from __future__ import annotations
+# from __future__ import annotations
-from importlib import import_module
-from types import ModuleType
-from typing import Any
+# from importlib import import_module
+# from types import ModuleType
+# from typing import Any
-A2A_EXTRA_INSTALL_HINT = (
- "Install it with 'pip install 10xscale-agentflow[a2a_sdk]' " "or 'pip install a2a-sdk'."
-)
+# A2A_EXTRA_INSTALL_HINT = (
+# "Install it with 'pip install 10xscale-agentflow[a2a_sdk]' " "or 'pip install a2a-sdk'."
+# )
-def missing_a2a_sdk_error(feature: str, exc: BaseException) -> RuntimeError:
- """Return a consistent error for A2A helpers when a2a-sdk is absent."""
- return RuntimeError(
- f"{feature} requires the optional 'a2a-sdk' package. {A2A_EXTRA_INSTALL_HINT}"
- )
+# def missing_a2a_sdk_error(feature: str, exc: BaseException) -> RuntimeError:
+# """Return a consistent error for A2A helpers when a2a-sdk is absent."""
+# return RuntimeError(
+# f"{feature} requires the optional 'a2a-sdk' package. {A2A_EXTRA_INSTALL_HINT}"
+# )
-def import_a2a_module(module_name: str, feature: str) -> ModuleType:
- """Import an A2A SDK module with a helpful optional-dependency error."""
- try:
- return import_module(module_name)
- except Exception as exc:
- raise missing_a2a_sdk_error(feature, exc) from exc
+# def import_a2a_module(module_name: str, feature: str) -> ModuleType:
+# """Import an A2A SDK module with a helpful optional-dependency error."""
+# try:
+# return import_module(module_name)
+# except Exception as exc:
+# raise missing_a2a_sdk_error(feature, exc) from exc
-def get_a2a_attr(module_name: str, attr_name: str, feature: str) -> Any:
- """Get an attribute from an A2A SDK module with a consistent error."""
- module = import_a2a_module(module_name, feature)
- try:
- return getattr(module, attr_name)
- except AttributeError as exc:
- raise missing_a2a_sdk_error(feature, exc) from exc
+# def get_a2a_attr(module_name: str, attr_name: str, feature: str) -> Any:
+# """Get an attribute from an A2A SDK module with a consistent error."""
+# module = import_a2a_module(module_name, feature)
+# try:
+# return getattr(module, attr_name)
+# except AttributeError as exc:
+# raise missing_a2a_sdk_error(feature, exc) from exc
diff --git a/agentflow/runtime/protocols/a2a/client.py b/agentflow/runtime/protocols/a2a/client.py
index 43daf86..05a0e59 100644
--- a/agentflow/runtime/protocols/a2a/client.py
+++ b/agentflow/runtime/protocols/a2a/client.py
@@ -1,222 +1,222 @@
-"""
-A2A client helpers for agentflow.
+# """
+# A2A client helpers for agentflow.
-Provides utilities to call any remote A2A-compliant agent from within
-an agentflow graph.
+# Provides utilities to call any remote A2A-compliant agent from within
+# an agentflow graph.
-Functions:
- delegate_to_a2a_agent — async one-shot: send text, get text back.
- create_a2a_client_node — factory returning a graph-compatible node
- function that delegates to a remote A2A
- agent.
-"""
+# Functions:
+# delegate_to_a2a_agent — async one-shot: send text, get text back.
+# create_a2a_client_node — factory returning a graph-compatible node
+# function that delegates to a remote A2A
+# agent.
+# """
-from __future__ import annotations
+# from __future__ import annotations
-import logging
-import uuid
-from importlib import import_module
-from typing import Any
-
-from agentflow.core.state.agent_state import AgentState
-from agentflow.core.state.message import Message as AFMessage
+# import logging
+# import uuid
+# from importlib import import_module
+# from typing import Any
+
+# from agentflow.core.state.agent_state import AgentState
+# from agentflow.core.state.message import Message as AFMessage
-from ._optional import A2A_EXTRA_INSTALL_HINT, get_a2a_attr, import_a2a_module
-
-
-logger = logging.getLogger("agentflow.a2a")
-
-
-def _import_client_dependencies():
- """Load client-only optional dependencies when an A2A call is made."""
- feature = "A2A client helpers"
- try:
- httpx = import_module("httpx")
- except Exception as exc:
- raise RuntimeError(
- f"{feature} requires the optional 'httpx' package. {A2A_EXTRA_INSTALL_HINT}"
- ) from exc
+# from ._optional import A2A_EXTRA_INSTALL_HINT, get_a2a_attr, import_a2a_module
+
+
+# logger = logging.getLogger("agentflow.a2a")
+
+
+# def _import_client_dependencies():
+# """Load client-only optional dependencies when an A2A call is made."""
+# feature = "A2A client helpers"
+# try:
+# httpx = import_module("httpx")
+# except Exception as exc:
+# raise RuntimeError(
+# f"{feature} requires the optional 'httpx' package. {A2A_EXTRA_INSTALL_HINT}"
+# ) from exc
- a2a_client = get_a2a_attr("a2a.client", "A2AClient", feature)
- a2a_types = import_a2a_module("a2a.types", feature)
- return httpx, a2a_client, a2a_types
+# a2a_client = get_a2a_attr("a2a.client", "A2AClient", feature)
+# a2a_types = import_a2a_module("a2a.types", feature)
+# return httpx, a2a_client, a2a_types
-# ---------------------------------------------------------------------- #
-# Low-level helper #
-# ---------------------------------------------------------------------- #
-
-
-async def delegate_to_a2a_agent(
- url: str,
- text: str,
- *,
- context_id: str | None = None,
- timeout: float = 30.0,
-) -> str:
- """Call a remote A2A agent and return its text response.
+# # ---------------------------------------------------------------------- #
+# # Low-level helper #
+# # ---------------------------------------------------------------------- #
+
+
+# async def delegate_to_a2a_agent(
+# url: str,
+# text: str,
+# *,
+# context_id: str | None = None,
+# timeout: float = 30.0,
+# ) -> str:
+# """Call a remote A2A agent and return its text response.
- This uses the (deprecated but stable) ``A2AClient`` from the a2a-sdk
- which provides the simplest request/response interface.
+# This uses the (deprecated but stable) ``A2AClient`` from the a2a-sdk
+# which provides the simplest request/response interface.
- Args:
- url: Base URL of the remote agent (e.g. ``http://localhost:9999``).
- text: The user message to send.
- timeout: HTTP request timeout in seconds.
-
- Returns:
- The text content of the agent's response.
-
- Raises:
- RuntimeError: If the agent returns an error or no text parts.
- """
- httpx, a2a_client, a2a_types = _import_client_dependencies()
-
- async with httpx.AsyncClient(timeout=timeout) as http:
- client = a2a_client(httpx_client=http, url=url)
-
- request = a2a_types.SendMessageRequest(
- id=str(uuid.uuid4()),
- params=a2a_types.MessageSendParams(
- message=a2a_types.Message(
- role=a2a_types.Role.user,
- message_id=str(uuid.uuid4()),
- context_id=context_id,
- parts=[a2a_types.TextPart(text=text)],
- ),
- ),
- )
-
- response = await client.send_message(request)
-
- # response.root is either SendMessageSuccessResponse or JSONRPCErrorResponse
- result = response.root
- if hasattr(result, "error"):
- raise RuntimeError(f"A2A agent returned error: {result.error}")
-
- # result.result is Task | Message
- payload = result.result
-
- # Extract text from the response
- return _extract_text(payload)
-
-
-def _extract_text(payload: Any) -> str:
- """Pull text from a Task or Message returned by the A2A SDK.
-
- The SDK wraps parts in ``Part(root=TextPart(...))`` — a discriminated
- union. We check both ``part.text`` (direct TextPart) and
- ``part.root.text`` (wrapped Part) to be resilient.
- """
- parts: list[Any] = []
-
- if hasattr(payload, "parts"):
- # It's an A2A Message
- parts = payload.parts or []
- elif hasattr(payload, "artifacts") and payload.artifacts:
- # It's a Task — text lives in artifact parts
- for artifact in payload.artifacts:
- parts.extend(artifact.parts or [])
- elif hasattr(payload, "status") and payload.status and payload.status.message:
- # Fallback: check status message
- parts = payload.status.message.parts or []
-
- text_parts: list[str] = []
- for p in parts:
- # Direct TextPart (has .text)
- if hasattr(p, "text") and isinstance(p.text, str):
- text_parts.append(p.text)
- # Wrapped Part(root=TextPart(...))
- elif hasattr(p, "root") and hasattr(p.root, "text") and isinstance(p.root.text, str):
- text_parts.append(p.root.text)
-
- if text_parts:
- return "\n".join(text_parts)
-
- raise RuntimeError("A2A agent response contained no text parts")
-
-
-# ---------------------------------------------------------------------- #
-# Graph node factory #
-# ---------------------------------------------------------------------- #
-
-
-def create_a2a_client_node(
- url: str,
- *,
- timeout: float = 30.0,
- response_role: str = "assistant",
-):
- """Return an async callable that can be used as an agentflow graph node.
-
- The node reads the last message from the state, forwards its text to
- the remote A2A agent at *url*, and returns the response as a new
- ``Message``.
-
- Usage::
-
- graph.add_node("remote_agent", create_a2a_client_node("http://localhost:9999"))
- graph.add_edge("some_node", "remote_agent")
- graph.add_edge("remote_agent", END)
-
- Args:
- url: Base URL of the remote A2A agent.
- timeout: HTTP request timeout.
- response_role: Role to assign to the response message
- (default ``"assistant"``).
-
- Returns:
- An async function with signature
- ``(state: AgentState, config: dict) -> list[AFMessage]``
- """
-
- async def _a2a_node(state: AgentState, config: dict) -> list[AFMessage]:
- # Get text from the last message in the conversation
- if not state.context:
- return [
- AFMessage.text_message(
- "No input provided.",
- role=response_role,
- ),
- ]
-
- user_text = state.context[-1].text()
- if not user_text:
- return [
- AFMessage.text_message(
- "Empty input.",
- role=response_role,
- ),
- ]
-
- # Reuse the parent graph's thread_id as context_id so the remote
- # A2A agent stays in the same session as the whole workflow.
- # The server uses context_id as its own thread_id for its checkpointer,
- # so it maintains full conversation history server-side across turns.
- context_id = config.get("thread_id")
-
- try:
- response = await delegate_to_a2a_agent(
- url, user_text, context_id=context_id, timeout=timeout
- )
- except Exception as exc:
- logger.exception("A2A client node failed for url=%s", url)
- return [
- AFMessage.text_message(
- f"A2A call failed: {exc!s}",
- role=response_role,
- ),
- ]
-
- return [
- AFMessage.text_message(
- response,
- role=response_role,
- ),
- ]
-
- # Give the function a useful name for debugging / graph visualization
- _a2a_node.__name__ = f"a2a_client_node({url})"
- _a2a_node.__qualname__ = _a2a_node.__name__
-
- return _a2a_node
+# Args:
+# url: Base URL of the remote agent (e.g. ``http://localhost:9999``).
+# text: The user message to send.
+# timeout: HTTP request timeout in seconds.
+
+# Returns:
+# The text content of the agent's response.
+
+# Raises:
+# RuntimeError: If the agent returns an error or no text parts.
+# """
+# httpx, a2a_client, a2a_types = _import_client_dependencies()
+
+# async with httpx.AsyncClient(timeout=timeout) as http:
+# client = a2a_client(httpx_client=http, url=url)
+
+# request = a2a_types.SendMessageRequest(
+# id=str(uuid.uuid4()),
+# params=a2a_types.MessageSendParams(
+# message=a2a_types.Message(
+# role=a2a_types.Role.user,
+# message_id=str(uuid.uuid4()),
+# context_id=context_id,
+# parts=[a2a_types.TextPart(text=text)],
+# ),
+# ),
+# )
+
+# response = await client.send_message(request)
+
+# # response.root is either SendMessageSuccessResponse or JSONRPCErrorResponse
+# result = response.root
+# if hasattr(result, "error"):
+# raise RuntimeError(f"A2A agent returned error: {result.error}")
+
+# # result.result is Task | Message
+# payload = result.result
+
+# # Extract text from the response
+# return _extract_text(payload)
+
+
+# def _extract_text(payload: Any) -> str:
+# """Pull text from a Task or Message returned by the A2A SDK.
+
+# The SDK wraps parts in ``Part(root=TextPart(...))`` — a discriminated
+# union. We check both ``part.text`` (direct TextPart) and
+# ``part.root.text`` (wrapped Part) to be resilient.
+# """
+# parts: list[Any] = []
+
+# if hasattr(payload, "parts"):
+# # It's an A2A Message
+# parts = payload.parts or []
+# elif hasattr(payload, "artifacts") and payload.artifacts:
+# # It's a Task — text lives in artifact parts
+# for artifact in payload.artifacts:
+# parts.extend(artifact.parts or [])
+# elif hasattr(payload, "status") and payload.status and payload.status.message:
+# # Fallback: check status message
+# parts = payload.status.message.parts or []
+
+# text_parts: list[str] = []
+# for p in parts:
+# # Direct TextPart (has .text)
+# if hasattr(p, "text") and isinstance(p.text, str):
+# text_parts.append(p.text)
+# # Wrapped Part(root=TextPart(...))
+# elif hasattr(p, "root") and hasattr(p.root, "text") and isinstance(p.root.text, str):
+# text_parts.append(p.root.text)
+
+# if text_parts:
+# return "\n".join(text_parts)
+
+# raise RuntimeError("A2A agent response contained no text parts")
+
+
+# # ---------------------------------------------------------------------- #
+# # Graph node factory #
+# # ---------------------------------------------------------------------- #
+
+
+# def create_a2a_client_node(
+# url: str,
+# *,
+# timeout: float = 30.0,
+# response_role: str = "assistant",
+# ):
+# """Return an async callable that can be used as an agentflow graph node.
+
+# The node reads the last message from the state, forwards its text to
+# the remote A2A agent at *url*, and returns the response as a new
+# ``Message``.
+
+# Usage::
+
+# graph.add_node("remote_agent", create_a2a_client_node("http://localhost:9999"))
+# graph.add_edge("some_node", "remote_agent")
+# graph.add_edge("remote_agent", END)
+
+# Args:
+# url: Base URL of the remote A2A agent.
+# timeout: HTTP request timeout.
+# response_role: Role to assign to the response message
+# (default ``"assistant"``).
+
+# Returns:
+# An async function with signature
+# ``(state: AgentState, config: dict) -> list[AFMessage]``
+# """
+
+# async def _a2a_node(state: AgentState, config: dict) -> list[AFMessage]:
+# # Get text from the last message in the conversation
+# if not state.context:
+# return [
+# AFMessage.text_message(
+# "No input provided.",
+# role=response_role,
+# ),
+# ]
+
+# user_text = state.context[-1].text()
+# if not user_text:
+# return [
+# AFMessage.text_message(
+# "Empty input.",
+# role=response_role,
+# ),
+# ]
+
+# # Reuse the parent graph's thread_id as context_id so the remote
+# # A2A agent stays in the same session as the whole workflow.
+# # The server uses context_id as its own thread_id for its checkpointer,
+# # so it maintains full conversation history server-side across turns.
+# context_id = config.get("thread_id")
+
+# try:
+# response = await delegate_to_a2a_agent(
+# url, user_text, context_id=context_id, timeout=timeout
+# )
+# except Exception as exc:
+# logger.exception("A2A client node failed for url=%s", url)
+# return [
+# AFMessage.text_message(
+# f"A2A call failed: {exc!s}",
+# role=response_role,
+# ),
+# ]
+
+# return [
+# AFMessage.text_message(
+# response,
+# role=response_role,
+# ),
+# ]
+
+# # Give the function a useful name for debugging / graph visualization
+# _a2a_node.__name__ = f"a2a_client_node({url})"
+# _a2a_node.__qualname__ = _a2a_node.__name__
+
+# return _a2a_node
diff --git a/agentflow/runtime/protocols/a2a/executor.py b/agentflow/runtime/protocols/a2a/executor.py
index e138543..41f9b7d 100644
--- a/agentflow/runtime/protocols/a2a/executor.py
+++ b/agentflow/runtime/protocols/a2a/executor.py
@@ -1,224 +1,224 @@
-"""
-AgentFlowExecutor — the sole bridge between agentflow and the a2a-sdk.
+# """
+# AgentFlowExecutor — the sole bridge between agentflow and the a2a-sdk.
-This module implements the ``AgentExecutor`` interface from the official
-``a2a-sdk`` so that **any** agentflow ``CompiledGraph`` can be served as
-a standard A2A agent. The SDK handles all HTTP, JSON-RPC, SSE, and task
-lifecycle concerns. Agentflow handles all agent logic.
-
-Blocking path (default):
- Uses ``CompiledGraph.ainvoke`` to run the graph to completion, then
- emits a single ``TextPart`` artifact with the last assistant message.
-
-Streaming path (``streaming=True``):
- Uses ``CompiledGraph.astream`` to yield incremental ``StreamChunk``
- objects. For each chunk that carries a message, the executor sends a
- ``TaskState.working`` status update so the A2A client can observe
- progress. After the stream finishes the final text is emitted as an
- artifact.
-"""
-
-from __future__ import annotations
-
-import logging
-from typing import TYPE_CHECKING, Any
-
-from agentflow.core.state.message import Message as AFMessage
-from agentflow.core.state.stream_chunks import StreamEvent
-from agentflow.utils.constants import ResponseGranularity
-
-from ._optional import missing_a2a_sdk_error
-
-
-if TYPE_CHECKING:
- from agentflow.core.graph.compiled_graph import CompiledGraph
-
-
-try:
- from a2a.server.agent_execution import AgentExecutor as _AgentExecutor
- from a2a.server.agent_execution.context import RequestContext
- from a2a.server.events.event_queue import EventQueue
- from a2a.server.tasks.task_updater import TaskUpdater
- from a2a.types import TaskState, TextPart
-except Exception as exc:
- _A2A_IMPORT_ERROR: BaseException | None = exc
-
- class _AgentExecutor:
- """Fallback base class used when a2a-sdk is not installed."""
-
-else:
- _A2A_IMPORT_ERROR = None
-
-AgentExecutor = _AgentExecutor
-
-
-logger = logging.getLogger("agentflow.a2a")
-
-
-class AgentFlowExecutor(AgentExecutor):
- """Bridges a :class:`CompiledGraph` into the A2A execution model.
-
- This is the **only** glue code needed between agentflow and a2a-sdk.
- The SDK owns the transport; agentflow owns the agent logic.
-
- Args:
- compiled_graph: A fully compiled agentflow graph.
- config: Optional base config dict forwarded to ``ainvoke`` /
- ``astream`` (e.g. ``thread_id``, ``recursion_limit``).
- streaming: When *True* the executor uses ``astream`` instead of
- ``ainvoke`` and sends ``TaskState.working`` progress events.
- """
-
- def __init__(
- self,
- compiled_graph: CompiledGraph,
- config: dict[str, Any] | None = None,
- streaming: bool = False,
- ) -> None:
- if _A2A_IMPORT_ERROR is not None:
- raise missing_a2a_sdk_error("AgentFlowExecutor", _A2A_IMPORT_ERROR) from (
- _A2A_IMPORT_ERROR
- )
-
- self.graph = compiled_graph
- self._base_config = config or {}
- self._streaming = streaming
-
- # ------------------------------------------------------------------ #
- # A2A AgentExecutor interface #
- # ------------------------------------------------------------------ #
-
- async def execute(
- self,
- context: RequestContext,
- event_queue: EventQueue,
- ) -> None:
- """Run the agentflow graph for an incoming A2A request.
-
- 1. Extract user text from the A2A message parts.
- 2. Run the graph (blocking or streaming).
- 3. Push the result back as an A2A artifact.
- """
- updater = TaskUpdater(
- event_queue=event_queue,
- task_id=context.task_id or "",
- context_id=context.context_id or "",
- )
- await updater.submit()
- await updater.start_work()
-
- try:
- # --- extract user text from A2A message parts ----------------
- user_text = context.get_user_input() if context.message else ""
-
- # build agentflow messages list
- messages = [AFMessage.text_message(user_text, role="user")]
-
- # per-request config — use context_id as thread_id so that
- # conversation history persists across A2A tasks within the
- # same session. Falls back to task_id for one-shot callers.
- run_config: dict[str, Any] = {**self._base_config}
- if "thread_id" not in run_config:
- run_config["thread_id"] = context.context_id or context.task_id or ""
-
- if self._streaming:
- response_text = await self._execute_streaming(messages, run_config, updater)
- else:
- response_text = await self._execute_blocking(messages, run_config)
-
- # --- emit the final artifact ---------------------------------
- await updater.add_artifact([TextPart(text=response_text)])
- await updater.complete()
-
- except Exception as exc:
- logger.exception("AgentFlowExecutor: graph execution failed")
- error_msg = updater.new_agent_message(parts=[TextPart(text=f"Error: {exc!s}")])
- await updater.failed(message=error_msg)
-
- async def cancel(
- self,
- context: RequestContext,
- event_queue: EventQueue,
- ) -> None:
- """Cancel is not currently supported by agentflow graphs."""
- raise NotImplementedError("cancel not supported")
-
- # ------------------------------------------------------------------ #
- # Internal helpers #
- # ------------------------------------------------------------------ #
-
- async def _execute_blocking(
- self,
- messages: list[AFMessage],
- config: dict[str, Any],
- ) -> str:
- """Run the graph via ``ainvoke`` and return the last assistant text."""
- result = await self.graph.ainvoke(
- {"messages": messages},
- config=config,
- response_granularity=ResponseGranularity.FULL,
- )
- return self._extract_response_text(result)
-
- async def _execute_streaming(
- self,
- messages: list[AFMessage],
- config: dict[str, Any],
- updater: TaskUpdater,
- ) -> str:
- """Run the graph via ``astream``, sending progress updates per chunk."""
- last_text = ""
- async for chunk in self.graph.astream(
- {"messages": messages},
- config=config,
- response_granularity=ResponseGranularity.FULL,
- ):
- if chunk.event == StreamEvent.MESSAGE and chunk.message is not None:
- text = chunk.message.text()
- if text:
- last_text = text
- # signal progress with the latest text
- progress_msg = updater.new_agent_message(parts=[TextPart(text=text)])
- await updater.update_status(TaskState.working, message=progress_msg)
- elif chunk.event == StreamEvent.STATE and chunk.state is not None:
- # final state arrived — extract from it
- assistant_text = self._extract_state_text(chunk.state)
- if assistant_text:
- last_text = assistant_text
-
- return last_text or "No response generated."
-
- # ------------------------------------------------------------------ #
- # Text extraction #
- # ------------------------------------------------------------------ #
-
- @staticmethod
- def _extract_response_text(result: dict[str, Any]) -> str:
- """Pull the last assistant message text from an ``ainvoke`` result.
-
- With ``ResponseGranularity.FULL`` the result dict contains
- ``"state"`` (the complete ``AgentState``) as well as
- ``"messages"`` (last-step messages). We prefer the full state
- because it has all messages across every node.
- """
- # Primary: full state context
- full_state = result.get("state")
- if full_state is not None:
- for msg in reversed(full_state.context):
- if msg.role == "assistant":
- return msg.text() or ""
-
- # Fallback: messages list at LOW/PARTIAL granularity
- for msg in reversed(result.get("messages", [])):
- if msg.role == "assistant":
- return msg.text() or ""
-
- return "No response generated."
-
- @staticmethod
- def _extract_state_text(state: Any) -> str:
- """Extract last assistant text from an ``AgentState``."""
- for msg in reversed(state.context):
- if msg.role == "assistant":
- return msg.text() or ""
- return ""
+# This module implements the ``AgentExecutor`` interface from the official
+# ``a2a-sdk`` so that **any** agentflow ``CompiledGraph`` can be served as
+# a standard A2A agent. The SDK handles all HTTP, JSON-RPC, SSE, and task
+# lifecycle concerns. Agentflow handles all agent logic.
+
+# Blocking path (default):
+# Uses ``CompiledGraph.ainvoke`` to run the graph to completion, then
+# emits a single ``TextPart`` artifact with the last assistant message.
+
+# Streaming path (``streaming=True``):
+# Uses ``CompiledGraph.astream`` to yield incremental ``StreamChunk``
+# objects. For each chunk that carries a message, the executor sends a
+# ``TaskState.working`` status update so the A2A client can observe
+# progress. After the stream finishes the final text is emitted as an
+# artifact.
+# """
+
+# from __future__ import annotations
+
+# import logging
+# from typing import TYPE_CHECKING, Any
+
+# from agentflow.core.state.message import Message as AFMessage
+# from agentflow.core.state.stream_chunks import StreamEvent
+# from agentflow.utils.constants import ResponseGranularity
+
+# from ._optional import missing_a2a_sdk_error
+
+
+# if TYPE_CHECKING:
+# from agentflow.core.graph.compiled_graph import CompiledGraph
+
+
+# try:
+# from a2a.server.agent_execution import AgentExecutor as _AgentExecutor
+# from a2a.server.agent_execution.context import RequestContext
+# from a2a.server.events.event_queue import EventQueue
+# from a2a.server.tasks.task_updater import TaskUpdater
+# from a2a.types import TaskState, TextPart
+# except Exception as exc:
+# _A2A_IMPORT_ERROR: BaseException | None = exc
+
+# class _AgentExecutor:
+# """Fallback base class used when a2a-sdk is not installed."""
+
+# else:
+# _A2A_IMPORT_ERROR = None
+
+# AgentExecutor = _AgentExecutor
+
+
+# logger = logging.getLogger("agentflow.a2a")
+
+
+# class AgentFlowExecutor(AgentExecutor):
+# """Bridges a :class:`CompiledGraph` into the A2A execution model.
+
+# This is the **only** glue code needed between agentflow and a2a-sdk.
+# The SDK owns the transport; agentflow owns the agent logic.
+
+# Args:
+# compiled_graph: A fully compiled agentflow graph.
+# config: Optional base config dict forwarded to ``ainvoke`` /
+# ``astream`` (e.g. ``thread_id``, ``recursion_limit``).
+# streaming: When *True* the executor uses ``astream`` instead of
+# ``ainvoke`` and sends ``TaskState.working`` progress events.
+# """
+
+# def __init__(
+# self,
+# compiled_graph: CompiledGraph,
+# config: dict[str, Any] | None = None,
+# streaming: bool = False,
+# ) -> None:
+# if _A2A_IMPORT_ERROR is not None:
+# raise missing_a2a_sdk_error("AgentFlowExecutor", _A2A_IMPORT_ERROR) from (
+# _A2A_IMPORT_ERROR
+# )
+
+# self.graph = compiled_graph
+# self._base_config = config or {}
+# self._streaming = streaming
+
+# # ------------------------------------------------------------------ #
+# # A2A AgentExecutor interface #
+# # ------------------------------------------------------------------ #
+
+# async def execute(
+# self,
+# context: RequestContext,
+# event_queue: EventQueue,
+# ) -> None:
+# """Run the agentflow graph for an incoming A2A request.
+
+# 1. Extract user text from the A2A message parts.
+# 2. Run the graph (blocking or streaming).
+# 3. Push the result back as an A2A artifact.
+# """
+# updater = TaskUpdater(
+# event_queue=event_queue,
+# task_id=context.task_id or "",
+# context_id=context.context_id or "",
+# )
+# await updater.submit()
+# await updater.start_work()
+
+# try:
+# # --- extract user text from A2A message parts ----------------
+# user_text = context.get_user_input() if context.message else ""
+
+# # build agentflow messages list
+# messages = [AFMessage.text_message(user_text, role="user")]
+
+# # per-request config — use context_id as thread_id so that
+# # conversation history persists across A2A tasks within the
+# # same session. Falls back to task_id for one-shot callers.
+# run_config: dict[str, Any] = {**self._base_config}
+# if "thread_id" not in run_config:
+# run_config["thread_id"] = context.context_id or context.task_id or ""
+
+# if self._streaming:
+# response_text = await self._execute_streaming(messages, run_config, updater)
+# else:
+# response_text = await self._execute_blocking(messages, run_config)
+
+# # --- emit the final artifact ---------------------------------
+# await updater.add_artifact([TextPart(text=response_text)])
+# await updater.complete()
+
+# except Exception as exc:
+# logger.exception("AgentFlowExecutor: graph execution failed")
+# error_msg = updater.new_agent_message(parts=[TextPart(text=f"Error: {exc!s}")])
+# await updater.failed(message=error_msg)
+
+# async def cancel(
+# self,
+# context: RequestContext,
+# event_queue: EventQueue,
+# ) -> None:
+# """Cancel is not currently supported by agentflow graphs."""
+# raise NotImplementedError("cancel not supported")
+
+# # ------------------------------------------------------------------ #
+# # Internal helpers #
+# # ------------------------------------------------------------------ #
+
+# async def _execute_blocking(
+# self,
+# messages: list[AFMessage],
+# config: dict[str, Any],
+# ) -> str:
+# """Run the graph via ``ainvoke`` and return the last assistant text."""
+# result = await self.graph.ainvoke(
+# {"messages": messages},
+# config=config,
+# response_granularity=ResponseGranularity.FULL,
+# )
+# return self._extract_response_text(result)
+
+# async def _execute_streaming(
+# self,
+# messages: list[AFMessage],
+# config: dict[str, Any],
+# updater: TaskUpdater,
+# ) -> str:
+# """Run the graph via ``astream``, sending progress updates per chunk."""
+# last_text = ""
+# async for chunk in self.graph.astream(
+# {"messages": messages},
+# config=config,
+# response_granularity=ResponseGranularity.FULL,
+# ):
+# if chunk.event == StreamEvent.MESSAGE and chunk.message is not None:
+# text = chunk.message.text()
+# if text:
+# last_text = text
+# # signal progress with the latest text
+# progress_msg = updater.new_agent_message(parts=[TextPart(text=text)])
+# await updater.update_status(TaskState.working, message=progress_msg)
+# elif chunk.event == StreamEvent.STATE and chunk.state is not None:
+# # final state arrived — extract from it
+# assistant_text = self._extract_state_text(chunk.state)
+# if assistant_text:
+# last_text = assistant_text
+
+# return last_text or "No response generated."
+
+# # ------------------------------------------------------------------ #
+# # Text extraction #
+# # ------------------------------------------------------------------ #
+
+# @staticmethod
+# def _extract_response_text(result: dict[str, Any]) -> str:
+# """Pull the last assistant message text from an ``ainvoke`` result.
+
+# With ``ResponseGranularity.FULL`` the result dict contains
+# ``"state"`` (the complete ``AgentState``) as well as
+# ``"messages"`` (last-step messages). We prefer the full state
+# because it has all messages across every node.
+# """
+# # Primary: full state context
+# full_state = result.get("state")
+# if full_state is not None:
+# for msg in reversed(full_state.context):
+# if msg.role == "assistant":
+# return msg.text() or ""
+
+# # Fallback: messages list at LOW/PARTIAL granularity
+# for msg in reversed(result.get("messages", [])):
+# if msg.role == "assistant":
+# return msg.text() or ""
+
+# return "No response generated."
+
+# @staticmethod
+# def _extract_state_text(state: Any) -> str:
+# """Extract last assistant text from an ``AgentState``."""
+# for msg in reversed(state.context):
+# if msg.role == "assistant":
+# return msg.text() or ""
+# return ""
diff --git a/agentflow/runtime/protocols/a2a/server.py b/agentflow/runtime/protocols/a2a/server.py
index 5836e32..477a33f 100644
--- a/agentflow/runtime/protocols/a2a/server.py
+++ b/agentflow/runtime/protocols/a2a/server.py
@@ -1,172 +1,172 @@
-"""
-A2A server helpers for agentflow.
-
-Provides convenience functions to expose a :class:`CompiledGraph` as an
-A2A-compliant HTTP endpoint using the official ``a2a-sdk``.
-
-Functions:
- create_a2a_server — one-call to start a uvicorn server.
- build_a2a_app — returns a Starlette ASGI app (composable).
- make_agent_card — builds an ``AgentCard`` with sensible defaults.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, Any
-
-from ._optional import A2A_EXTRA_INSTALL_HINT, import_a2a_module
-
-
-if TYPE_CHECKING:
- from a2a.types import AgentCard, AgentSkill
- from starlette.applications import Starlette
-
- from agentflow.core.graph.compiled_graph import CompiledGraph
-
-
-# ---------------------------------------------------------------------- #
-# AgentCard helper #
-# ---------------------------------------------------------------------- #
-
-
-def make_agent_card(
- name: str,
- description: str,
- url: str,
- *,
- skills: list[AgentSkill] | None = None,
- streaming: bool = False,
- version: str = "1.0.0",
-) -> AgentCard:
- """Build an :class:`AgentCard` with sensible defaults.
-
- If *skills* is ``None`` a single ``"run_graph"`` skill is created
- automatically.
-
- Args:
- name: Human-readable agent name.
- description: Short description of what the agent does.
- url: Public URL where the agent is reachable.
- skills: Optional list of ``AgentSkill`` objects.
- streaming: Whether the agent supports SSE streaming.
- version: Semantic version string.
-
- Returns:
- A fully populated ``AgentCard``.
- """
- a2a_types = import_a2a_module("a2a.types", "make_agent_card")
-
- if skills is None:
- skills = [
- a2a_types.AgentSkill(
- id="run_graph",
- name="Run Graph",
- description="Execute the agentflow graph",
- tags=["agentflow"],
- )
- ]
-
- return a2a_types.AgentCard(
- name=name,
- description=description,
- url=url,
- version=version,
- capabilities=a2a_types.AgentCapabilities(streaming=streaming),
- default_input_modes=["text"],
- default_output_modes=["text"],
- skills=skills,
- )
-
-
-# ---------------------------------------------------------------------- #
-# ASGI app builder #
-# ---------------------------------------------------------------------- #
-
-
-def build_a2a_app(
- compiled_graph: CompiledGraph,
- agent_card: AgentCard,
- *,
- streaming: bool = False,
- executor_config: dict[str, Any] | None = None,
-) -> Starlette:
- """Return a Starlette ASGI app that speaks the A2A protocol.
-
- Useful when you want to mount the app inside another ASGI framework
- (e.g. FastAPI), run it with a custom server, or use it in tests.
-
- Args:
- compiled_graph: A compiled agentflow graph.
- agent_card: The ``AgentCard`` describing this agent.
- streaming: Whether to use ``astream`` vs ``ainvoke`` in the
- executor.
- executor_config: Optional base config forwarded to the graph
- (e.g. ``{"recursion_limit": 50}``).
-
- Returns:
- A ``Starlette`` application ready to be served.
- """
- a2a_apps = import_a2a_module("a2a.server.apps", "build_a2a_app")
- request_handlers = import_a2a_module("a2a.server.request_handlers", "build_a2a_app")
- tasks = import_a2a_module("a2a.server.tasks", "build_a2a_app")
-
- from .executor import AgentFlowExecutor
-
- executor = AgentFlowExecutor(
- compiled_graph,
- config=executor_config,
- streaming=streaming,
- )
- handler = request_handlers.DefaultRequestHandler(
- agent_executor=executor,
- task_store=tasks.InMemoryTaskStore(),
- )
- a2a_app = a2a_apps.A2AStarletteApplication(
- agent_card=agent_card,
- http_handler=handler,
- )
- return a2a_app.build()
-
-
-# ---------------------------------------------------------------------- #
-# One-call server #
-# ---------------------------------------------------------------------- #
-
-
-def create_a2a_server(
- compiled_graph: CompiledGraph,
- agent_card: AgentCard,
- *,
- host: str = "127.0.0.1",
- port: int = 9999,
- streaming: bool = False,
- executor_config: dict[str, Any] | None = None,
-) -> None:
- """Build and run an A2A server exposing the given graph.
-
- This is a blocking call — it starts uvicorn and does not return until
- the server is shut down.
-
- Args:
- compiled_graph: A compiled agentflow graph.
- agent_card: The ``AgentCard`` describing this agent.
- host: Bind address.
- port: Bind port.
- streaming: Whether to use ``astream`` in the executor.
- executor_config: Optional base config forwarded to the graph.
- """
- try:
- uvicorn = __import__("uvicorn")
- except Exception as exc:
- raise RuntimeError(
- "create_a2a_server requires the optional 'uvicorn' package. "
- f"{A2A_EXTRA_INSTALL_HINT}"
- ) from exc
-
- app = build_a2a_app(
- compiled_graph,
- agent_card,
- streaming=streaming,
- executor_config=executor_config,
- )
- uvicorn.run(app, host=host, port=port)
+# """
+# A2A server helpers for agentflow.
+
+# Provides convenience functions to expose a :class:`CompiledGraph` as an
+# A2A-compliant HTTP endpoint using the official ``a2a-sdk``.
+
+# Functions:
+# create_a2a_server — one-call to start a uvicorn server.
+# build_a2a_app — returns a Starlette ASGI app (composable).
+# make_agent_card — builds an ``AgentCard`` with sensible defaults.
+# """
+
+# from __future__ import annotations
+
+# from typing import TYPE_CHECKING, Any
+
+# from ._optional import A2A_EXTRA_INSTALL_HINT, import_a2a_module
+
+
+# if TYPE_CHECKING:
+# from a2a.types import AgentCard, AgentSkill
+# from starlette.applications import Starlette
+
+# from agentflow.core.graph.compiled_graph import CompiledGraph
+
+
+# # ---------------------------------------------------------------------- #
+# # AgentCard helper #
+# # ---------------------------------------------------------------------- #
+
+
+# def make_agent_card(
+# name: str,
+# description: str,
+# url: str,
+# *,
+# skills: list[AgentSkill] | None = None,
+# streaming: bool = False,
+# version: str = "1.0.0",
+# ) -> AgentCard:
+# """Build an :class:`AgentCard` with sensible defaults.
+
+# If *skills* is ``None`` a single ``"run_graph"`` skill is created
+# automatically.
+
+# Args:
+# name: Human-readable agent name.
+# description: Short description of what the agent does.
+# url: Public URL where the agent is reachable.
+# skills: Optional list of ``AgentSkill`` objects.
+# streaming: Whether the agent supports SSE streaming.
+# version: Semantic version string.
+
+# Returns:
+# A fully populated ``AgentCard``.
+# """
+# a2a_types = import_a2a_module("a2a.types", "make_agent_card")
+
+# if skills is None:
+# skills = [
+# a2a_types.AgentSkill(
+# id="run_graph",
+# name="Run Graph",
+# description="Execute the agentflow graph",
+# tags=["agentflow"],
+# )
+# ]
+
+# return a2a_types.AgentCard(
+# name=name,
+# description=description,
+# url=url,
+# version=version,
+# capabilities=a2a_types.AgentCapabilities(streaming=streaming),
+# default_input_modes=["text"],
+# default_output_modes=["text"],
+# skills=skills,
+# )
+
+
+# # ---------------------------------------------------------------------- #
+# # ASGI app builder #
+# # ---------------------------------------------------------------------- #
+
+
+# def build_a2a_app(
+# compiled_graph: CompiledGraph,
+# agent_card: AgentCard,
+# *,
+# streaming: bool = False,
+# executor_config: dict[str, Any] | None = None,
+# ) -> Starlette:
+# """Return a Starlette ASGI app that speaks the A2A protocol.
+
+# Useful when you want to mount the app inside another ASGI framework
+# (e.g. FastAPI), run it with a custom server, or use it in tests.
+
+# Args:
+# compiled_graph: A compiled agentflow graph.
+# agent_card: The ``AgentCard`` describing this agent.
+# streaming: Whether to use ``astream`` vs ``ainvoke`` in the
+# executor.
+# executor_config: Optional base config forwarded to the graph
+# (e.g. ``{"recursion_limit": 50}``).
+
+# Returns:
+# A ``Starlette`` application ready to be served.
+# """
+# a2a_apps = import_a2a_module("a2a.server.apps", "build_a2a_app")
+# request_handlers = import_a2a_module("a2a.server.request_handlers", "build_a2a_app")
+# tasks = import_a2a_module("a2a.server.tasks", "build_a2a_app")
+
+# from .executor import AgentFlowExecutor
+
+# executor = AgentFlowExecutor(
+# compiled_graph,
+# config=executor_config,
+# streaming=streaming,
+# )
+# handler = request_handlers.DefaultRequestHandler(
+# agent_executor=executor,
+# task_store=tasks.InMemoryTaskStore(),
+# )
+# a2a_app = a2a_apps.A2AStarletteApplication(
+# agent_card=agent_card,
+# http_handler=handler,
+# )
+# return a2a_app.build()
+
+
+# # ---------------------------------------------------------------------- #
+# # One-call server #
+# # ---------------------------------------------------------------------- #
+
+
+# def create_a2a_server(
+# compiled_graph: CompiledGraph,
+# agent_card: AgentCard,
+# *,
+# host: str = "127.0.0.1",
+# port: int = 9999,
+# streaming: bool = False,
+# executor_config: dict[str, Any] | None = None,
+# ) -> None:
+# """Build and run an A2A server exposing the given graph.
+
+# This is a blocking call — it starts uvicorn and does not return until
+# the server is shut down.
+
+# Args:
+# compiled_graph: A compiled agentflow graph.
+# agent_card: The ``AgentCard`` describing this agent.
+# host: Bind address.
+# port: Bind port.
+# streaming: Whether to use ``astream`` in the executor.
+# executor_config: Optional base config forwarded to the graph.
+# """
+# try:
+# uvicorn = __import__("uvicorn")
+# except Exception as exc:
+# raise RuntimeError(
+# "create_a2a_server requires the optional 'uvicorn' package. "
+# f"{A2A_EXTRA_INSTALL_HINT}"
+# ) from exc
+
+# app = build_a2a_app(
+# compiled_graph,
+# agent_card,
+# streaming=streaming,
+# executor_config=executor_config,
+# )
+# uvicorn.run(app, host=host, port=port)
diff --git a/eval_reports/s-file_20260610_231346.html b/eval_reports/s-file_20260610_231346.html
new file mode 100644
index 0000000..eb1816c
--- /dev/null
+++ b/eval_reports/s-file_20260610_231346.html
@@ -0,0 +1,954 @@
+
+
+
+
+
+ s-file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📈
+
100%
+
Pass Rate
+
+
+
+
⏱️
+
0.00s
+
Duration
+
+
+
+
+
+
+
📊 Criterion Breakdown
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/eval_reports/s-file_20260610_231346.json b/eval_reports/s-file_20260610_231346.json
new file mode 100644
index 0000000..bd48aaa
--- /dev/null
+++ b/eval_reports/s-file_20260610_231346.json
@@ -0,0 +1,123 @@
+{
+ "eval_set_id": "s-file",
+ "eval_set_name": "",
+ "results": [
+ {
+ "eval_id": "c1",
+ "name": "",
+ "passed": true,
+ "criterion_results": [],
+ "actual_trajectory": [],
+ "actual_tool_calls": [],
+ "actual_response": "",
+ "messages": [],
+ "node_responses": [],
+ "node_visits": [],
+ "duration_seconds": 0.0,
+ "error": null,
+ "metadata": {},
+ "turn_results": [],
+ "token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "agent_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "node_details": []
+ }
+ ],
+ "summary": {
+ "total_cases": 1,
+ "passed_cases": 1,
+ "failed_cases": 0,
+ "error_cases": 0,
+ "pass_rate": 1.0,
+ "avg_duration_seconds": 0.0,
+ "total_duration_seconds": 0.0,
+ "criterion_stats": {},
+ "total_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "per_case_token_usage": {
+ "c1": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ }
+ },
+ "avg_tokens_per_case": 0.0
+ },
+ "config_used": {
+ "criteria": {
+ "tool_name_match": null,
+ "trajectory": {
+ "threshold": 1.0,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "node_order": null,
+ "response_match": {
+ "threshold": 0.8,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "rouge_match": null,
+ "contains_keywords": null,
+ "llm_judge": null,
+ "rubric_based": null,
+ "factual_accuracy": null,
+ "hallucination": null,
+ "safety": null,
+ "simulation_goals": null
+ },
+ "user_simulator_config": null,
+ "parallel": false,
+ "max_concurrency": 4,
+ "timeout": 300.0,
+ "verbose": false,
+ "mock_mode": false,
+ "reporter": {
+ "enabled": true,
+ "output_dir": "eval_reports",
+ "console": true,
+ "json_report": true,
+ "html": true,
+ "junit_xml": false,
+ "verbose": true,
+ "include_details": true,
+ "include_trajectory": true,
+ "include_node_responses": true,
+ "include_actual_response": true,
+ "include_tool_call_details": true,
+ "timestamp_files": true
+ }
+ },
+ "timestamp": 1781111626.7762134,
+ "metadata": {}
+}
\ No newline at end of file
diff --git a/eval_reports/s-file_20260610_232729.html b/eval_reports/s-file_20260610_232729.html
new file mode 100644
index 0000000..f8d0069
--- /dev/null
+++ b/eval_reports/s-file_20260610_232729.html
@@ -0,0 +1,954 @@
+
+
+
+
+
+ s-file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📈
+
100%
+
Pass Rate
+
+
+
+
⏱️
+
0.00s
+
Duration
+
+
+
+
+
+
+
📊 Criterion Breakdown
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/eval_reports/s-file_20260610_232729.json b/eval_reports/s-file_20260610_232729.json
new file mode 100644
index 0000000..e4756f6
--- /dev/null
+++ b/eval_reports/s-file_20260610_232729.json
@@ -0,0 +1,123 @@
+{
+ "eval_set_id": "s-file",
+ "eval_set_name": "",
+ "results": [
+ {
+ "eval_id": "c1",
+ "name": "",
+ "passed": true,
+ "criterion_results": [],
+ "actual_trajectory": [],
+ "actual_tool_calls": [],
+ "actual_response": "",
+ "messages": [],
+ "node_responses": [],
+ "node_visits": [],
+ "duration_seconds": 0.0,
+ "error": null,
+ "metadata": {},
+ "turn_results": [],
+ "token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "agent_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "node_details": []
+ }
+ ],
+ "summary": {
+ "total_cases": 1,
+ "passed_cases": 1,
+ "failed_cases": 0,
+ "error_cases": 0,
+ "pass_rate": 1.0,
+ "avg_duration_seconds": 0.0,
+ "total_duration_seconds": 0.0,
+ "criterion_stats": {},
+ "total_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "per_case_token_usage": {
+ "c1": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ }
+ },
+ "avg_tokens_per_case": 0.0
+ },
+ "config_used": {
+ "criteria": {
+ "tool_name_match": null,
+ "trajectory": {
+ "threshold": 1.0,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "node_order": null,
+ "response_match": {
+ "threshold": 0.8,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "rouge_match": null,
+ "contains_keywords": null,
+ "llm_judge": null,
+ "rubric_based": null,
+ "factual_accuracy": null,
+ "hallucination": null,
+ "safety": null,
+ "simulation_goals": null
+ },
+ "user_simulator_config": null,
+ "parallel": false,
+ "max_concurrency": 4,
+ "timeout": 300.0,
+ "verbose": false,
+ "mock_mode": false,
+ "reporter": {
+ "enabled": true,
+ "output_dir": "eval_reports",
+ "console": true,
+ "json_report": true,
+ "html": true,
+ "junit_xml": false,
+ "verbose": true,
+ "include_details": true,
+ "include_trajectory": true,
+ "include_node_responses": true,
+ "include_actual_response": true,
+ "include_tool_call_details": true,
+ "timestamp_files": true
+ }
+ },
+ "timestamp": 1781112449.644461,
+ "metadata": {}
+}
\ No newline at end of file
diff --git a/eval_reports/s-file_20260610_233004.html b/eval_reports/s-file_20260610_233004.html
new file mode 100644
index 0000000..f099f4a
--- /dev/null
+++ b/eval_reports/s-file_20260610_233004.html
@@ -0,0 +1,954 @@
+
+
+
+
+
+ s-file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📈
+
100%
+
Pass Rate
+
+
+
+
⏱️
+
0.00s
+
Duration
+
+
+
+
+
+
+
📊 Criterion Breakdown
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/eval_reports/s-file_20260610_233004.json b/eval_reports/s-file_20260610_233004.json
new file mode 100644
index 0000000..33b19b9
--- /dev/null
+++ b/eval_reports/s-file_20260610_233004.json
@@ -0,0 +1,123 @@
+{
+ "eval_set_id": "s-file",
+ "eval_set_name": "",
+ "results": [
+ {
+ "eval_id": "c1",
+ "name": "",
+ "passed": true,
+ "criterion_results": [],
+ "actual_trajectory": [],
+ "actual_tool_calls": [],
+ "actual_response": "",
+ "messages": [],
+ "node_responses": [],
+ "node_visits": [],
+ "duration_seconds": 0.0,
+ "error": null,
+ "metadata": {},
+ "turn_results": [],
+ "token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "agent_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "node_details": []
+ }
+ ],
+ "summary": {
+ "total_cases": 1,
+ "passed_cases": 1,
+ "failed_cases": 0,
+ "error_cases": 0,
+ "pass_rate": 1.0,
+ "avg_duration_seconds": 0.0,
+ "total_duration_seconds": 0.0,
+ "criterion_stats": {},
+ "total_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "per_case_token_usage": {
+ "c1": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ }
+ },
+ "avg_tokens_per_case": 0.0
+ },
+ "config_used": {
+ "criteria": {
+ "tool_name_match": null,
+ "trajectory": {
+ "threshold": 1.0,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "node_order": null,
+ "response_match": {
+ "threshold": 0.8,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "rouge_match": null,
+ "contains_keywords": null,
+ "llm_judge": null,
+ "rubric_based": null,
+ "factual_accuracy": null,
+ "hallucination": null,
+ "safety": null,
+ "simulation_goals": null
+ },
+ "user_simulator_config": null,
+ "parallel": false,
+ "max_concurrency": 4,
+ "timeout": 300.0,
+ "verbose": false,
+ "mock_mode": false,
+ "reporter": {
+ "enabled": true,
+ "output_dir": "eval_reports",
+ "console": true,
+ "json_report": true,
+ "html": true,
+ "junit_xml": false,
+ "verbose": true,
+ "include_details": true,
+ "include_trajectory": true,
+ "include_node_responses": true,
+ "include_actual_response": true,
+ "include_tool_call_details": true,
+ "timestamp_files": true
+ }
+ },
+ "timestamp": 1781112604.7925491,
+ "metadata": {}
+}
\ No newline at end of file
diff --git a/eval_reports/s-file_20260610_234128.html b/eval_reports/s-file_20260610_234128.html
new file mode 100644
index 0000000..e82b5cb
--- /dev/null
+++ b/eval_reports/s-file_20260610_234128.html
@@ -0,0 +1,954 @@
+
+
+
+
+
+ s-file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📈
+
100%
+
Pass Rate
+
+
+
+
⏱️
+
0.00s
+
Duration
+
+
+
+
+
+
+
📊 Criterion Breakdown
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/eval_reports/s-file_20260610_234128.json b/eval_reports/s-file_20260610_234128.json
new file mode 100644
index 0000000..dd3a2ad
--- /dev/null
+++ b/eval_reports/s-file_20260610_234128.json
@@ -0,0 +1,123 @@
+{
+ "eval_set_id": "s-file",
+ "eval_set_name": "",
+ "results": [
+ {
+ "eval_id": "c1",
+ "name": "",
+ "passed": true,
+ "criterion_results": [],
+ "actual_trajectory": [],
+ "actual_tool_calls": [],
+ "actual_response": "",
+ "messages": [],
+ "node_responses": [],
+ "node_visits": [],
+ "duration_seconds": 0.0,
+ "error": null,
+ "metadata": {},
+ "turn_results": [],
+ "token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "agent_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "node_details": []
+ }
+ ],
+ "summary": {
+ "total_cases": 1,
+ "passed_cases": 1,
+ "failed_cases": 0,
+ "error_cases": 0,
+ "pass_rate": 1.0,
+ "avg_duration_seconds": 0.0,
+ "total_duration_seconds": 0.0,
+ "criterion_stats": {},
+ "total_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "per_case_token_usage": {
+ "c1": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ }
+ },
+ "avg_tokens_per_case": 0.0
+ },
+ "config_used": {
+ "criteria": {
+ "tool_name_match": null,
+ "trajectory": {
+ "threshold": 1.0,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "node_order": null,
+ "response_match": {
+ "threshold": 0.8,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "rouge_match": null,
+ "contains_keywords": null,
+ "llm_judge": null,
+ "rubric_based": null,
+ "factual_accuracy": null,
+ "hallucination": null,
+ "safety": null,
+ "simulation_goals": null
+ },
+ "user_simulator_config": null,
+ "parallel": false,
+ "max_concurrency": 4,
+ "timeout": 300.0,
+ "verbose": false,
+ "mock_mode": false,
+ "reporter": {
+ "enabled": true,
+ "output_dir": "eval_reports",
+ "console": true,
+ "json_report": true,
+ "html": true,
+ "junit_xml": false,
+ "verbose": true,
+ "include_details": true,
+ "include_trajectory": true,
+ "include_node_responses": true,
+ "include_actual_response": true,
+ "include_tool_call_details": true,
+ "timestamp_files": true
+ }
+ },
+ "timestamp": 1781113288.1752944,
+ "metadata": {}
+}
\ No newline at end of file
diff --git a/eval_reports/s-file_20260611_000420.html b/eval_reports/s-file_20260611_000420.html
new file mode 100644
index 0000000..9c2aa21
--- /dev/null
+++ b/eval_reports/s-file_20260611_000420.html
@@ -0,0 +1,954 @@
+
+
+
+
+
+ s-file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📈
+
100%
+
Pass Rate
+
+
+
+
⏱️
+
0.00s
+
Duration
+
+
+
+
+
+
+
📊 Criterion Breakdown
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/eval_reports/s-file_20260611_000420.json b/eval_reports/s-file_20260611_000420.json
new file mode 100644
index 0000000..1310bb8
--- /dev/null
+++ b/eval_reports/s-file_20260611_000420.json
@@ -0,0 +1,123 @@
+{
+ "eval_set_id": "s-file",
+ "eval_set_name": "",
+ "results": [
+ {
+ "eval_id": "c1",
+ "name": "",
+ "passed": true,
+ "criterion_results": [],
+ "actual_trajectory": [],
+ "actual_tool_calls": [],
+ "actual_response": "",
+ "messages": [],
+ "node_responses": [],
+ "node_visits": [],
+ "duration_seconds": 0.0,
+ "error": null,
+ "metadata": {},
+ "turn_results": [],
+ "token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "agent_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "node_details": []
+ }
+ ],
+ "summary": {
+ "total_cases": 1,
+ "passed_cases": 1,
+ "failed_cases": 0,
+ "error_cases": 0,
+ "pass_rate": 1.0,
+ "avg_duration_seconds": 0.0,
+ "total_duration_seconds": 0.0,
+ "criterion_stats": {},
+ "total_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "per_case_token_usage": {
+ "c1": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ }
+ },
+ "avg_tokens_per_case": 0.0
+ },
+ "config_used": {
+ "criteria": {
+ "tool_name_match": null,
+ "trajectory": {
+ "threshold": 1.0,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "node_order": null,
+ "response_match": {
+ "threshold": 0.8,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "rouge_match": null,
+ "contains_keywords": null,
+ "llm_judge": null,
+ "rubric_based": null,
+ "factual_accuracy": null,
+ "hallucination": null,
+ "safety": null,
+ "simulation_goals": null
+ },
+ "user_simulator_config": null,
+ "parallel": false,
+ "max_concurrency": 4,
+ "timeout": 300.0,
+ "verbose": false,
+ "mock_mode": false,
+ "reporter": {
+ "enabled": true,
+ "output_dir": "eval_reports",
+ "console": true,
+ "json_report": true,
+ "html": true,
+ "junit_xml": false,
+ "verbose": true,
+ "include_details": true,
+ "include_trajectory": true,
+ "include_node_responses": true,
+ "include_actual_response": true,
+ "include_tool_call_details": true,
+ "timestamp_files": true
+ }
+ },
+ "timestamp": 1781114660.4083369,
+ "metadata": {}
+}
\ No newline at end of file
diff --git a/eval_reports/s-file_20260611_000529.html b/eval_reports/s-file_20260611_000529.html
new file mode 100644
index 0000000..bbce957
--- /dev/null
+++ b/eval_reports/s-file_20260611_000529.html
@@ -0,0 +1,954 @@
+
+
+
+
+
+ s-file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📈
+
100%
+
Pass Rate
+
+
+
+
⏱️
+
0.00s
+
Duration
+
+
+
+
+
+
+
📊 Criterion Breakdown
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/eval_reports/s-file_20260611_000529.json b/eval_reports/s-file_20260611_000529.json
new file mode 100644
index 0000000..694b176
--- /dev/null
+++ b/eval_reports/s-file_20260611_000529.json
@@ -0,0 +1,123 @@
+{
+ "eval_set_id": "s-file",
+ "eval_set_name": "",
+ "results": [
+ {
+ "eval_id": "c1",
+ "name": "",
+ "passed": true,
+ "criterion_results": [],
+ "actual_trajectory": [],
+ "actual_tool_calls": [],
+ "actual_response": "",
+ "messages": [],
+ "node_responses": [],
+ "node_visits": [],
+ "duration_seconds": 0.0,
+ "error": null,
+ "metadata": {},
+ "turn_results": [],
+ "token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "agent_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "node_details": []
+ }
+ ],
+ "summary": {
+ "total_cases": 1,
+ "passed_cases": 1,
+ "failed_cases": 0,
+ "error_cases": 0,
+ "pass_rate": 1.0,
+ "avg_duration_seconds": 0.0,
+ "total_duration_seconds": 0.0,
+ "criterion_stats": {},
+ "total_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "per_case_token_usage": {
+ "c1": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ }
+ },
+ "avg_tokens_per_case": 0.0
+ },
+ "config_used": {
+ "criteria": {
+ "tool_name_match": null,
+ "trajectory": {
+ "threshold": 1.0,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "node_order": null,
+ "response_match": {
+ "threshold": 0.8,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "rouge_match": null,
+ "contains_keywords": null,
+ "llm_judge": null,
+ "rubric_based": null,
+ "factual_accuracy": null,
+ "hallucination": null,
+ "safety": null,
+ "simulation_goals": null
+ },
+ "user_simulator_config": null,
+ "parallel": false,
+ "max_concurrency": 4,
+ "timeout": 300.0,
+ "verbose": false,
+ "mock_mode": false,
+ "reporter": {
+ "enabled": true,
+ "output_dir": "eval_reports",
+ "console": true,
+ "json_report": true,
+ "html": true,
+ "junit_xml": false,
+ "verbose": true,
+ "include_details": true,
+ "include_trajectory": true,
+ "include_node_responses": true,
+ "include_actual_response": true,
+ "include_tool_call_details": true,
+ "timestamp_files": true
+ }
+ },
+ "timestamp": 1781114729.9052248,
+ "metadata": {}
+}
\ No newline at end of file
diff --git a/eval_reports/s-file_20260611_001112.html b/eval_reports/s-file_20260611_001112.html
new file mode 100644
index 0000000..642c8f8
--- /dev/null
+++ b/eval_reports/s-file_20260611_001112.html
@@ -0,0 +1,954 @@
+
+
+
+
+
+ s-file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📈
+
100%
+
Pass Rate
+
+
+
+
⏱️
+
0.00s
+
Duration
+
+
+
+
+
+
+
📊 Criterion Breakdown
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/eval_reports/s-file_20260611_001112.json b/eval_reports/s-file_20260611_001112.json
new file mode 100644
index 0000000..c7174a5
--- /dev/null
+++ b/eval_reports/s-file_20260611_001112.json
@@ -0,0 +1,123 @@
+{
+ "eval_set_id": "s-file",
+ "eval_set_name": "",
+ "results": [
+ {
+ "eval_id": "c1",
+ "name": "",
+ "passed": true,
+ "criterion_results": [],
+ "actual_trajectory": [],
+ "actual_tool_calls": [],
+ "actual_response": "",
+ "messages": [],
+ "node_responses": [],
+ "node_visits": [],
+ "duration_seconds": 0.0,
+ "error": null,
+ "metadata": {},
+ "turn_results": [],
+ "token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "agent_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "node_details": []
+ }
+ ],
+ "summary": {
+ "total_cases": 1,
+ "passed_cases": 1,
+ "failed_cases": 0,
+ "error_cases": 0,
+ "pass_rate": 1.0,
+ "avg_duration_seconds": 0.0,
+ "total_duration_seconds": 0.0,
+ "criterion_stats": {},
+ "total_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "per_case_token_usage": {
+ "c1": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ }
+ },
+ "avg_tokens_per_case": 0.0
+ },
+ "config_used": {
+ "criteria": {
+ "tool_name_match": null,
+ "trajectory": {
+ "threshold": 1.0,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "node_order": null,
+ "response_match": {
+ "threshold": 0.8,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "rouge_match": null,
+ "contains_keywords": null,
+ "llm_judge": null,
+ "rubric_based": null,
+ "factual_accuracy": null,
+ "hallucination": null,
+ "safety": null,
+ "simulation_goals": null
+ },
+ "user_simulator_config": null,
+ "parallel": false,
+ "max_concurrency": 4,
+ "timeout": 300.0,
+ "verbose": false,
+ "mock_mode": false,
+ "reporter": {
+ "enabled": true,
+ "output_dir": "eval_reports",
+ "console": true,
+ "json_report": true,
+ "html": true,
+ "junit_xml": false,
+ "verbose": true,
+ "include_details": true,
+ "include_trajectory": true,
+ "include_node_responses": true,
+ "include_actual_response": true,
+ "include_tool_call_details": true,
+ "timestamp_files": true
+ }
+ },
+ "timestamp": 1781115072.5616362,
+ "metadata": {}
+}
\ No newline at end of file
diff --git a/eval_reports/s-file_20260611_001220.html b/eval_reports/s-file_20260611_001220.html
new file mode 100644
index 0000000..a494286
--- /dev/null
+++ b/eval_reports/s-file_20260611_001220.html
@@ -0,0 +1,954 @@
+
+
+
+
+
+ s-file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📈
+
100%
+
Pass Rate
+
+
+
+
⏱️
+
0.00s
+
Duration
+
+
+
+
+
+
+
📊 Criterion Breakdown
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/eval_reports/s-file_20260611_001220.json b/eval_reports/s-file_20260611_001220.json
new file mode 100644
index 0000000..74a23de
--- /dev/null
+++ b/eval_reports/s-file_20260611_001220.json
@@ -0,0 +1,123 @@
+{
+ "eval_set_id": "s-file",
+ "eval_set_name": "",
+ "results": [
+ {
+ "eval_id": "c1",
+ "name": "",
+ "passed": true,
+ "criterion_results": [],
+ "actual_trajectory": [],
+ "actual_tool_calls": [],
+ "actual_response": "",
+ "messages": [],
+ "node_responses": [],
+ "node_visits": [],
+ "duration_seconds": 0.0,
+ "error": null,
+ "metadata": {},
+ "turn_results": [],
+ "token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "agent_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "node_details": []
+ }
+ ],
+ "summary": {
+ "total_cases": 1,
+ "passed_cases": 1,
+ "failed_cases": 0,
+ "error_cases": 0,
+ "pass_rate": 1.0,
+ "avg_duration_seconds": 0.0,
+ "total_duration_seconds": 0.0,
+ "criterion_stats": {},
+ "total_token_usage": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ },
+ "per_case_token_usage": {
+ "c1": {
+ "input_tokens": 0,
+ "output_tokens": 0,
+ "cache_read_tokens": 0,
+ "cache_creation_tokens": 0,
+ "total_tokens": 0
+ }
+ },
+ "avg_tokens_per_case": 0.0
+ },
+ "config_used": {
+ "criteria": {
+ "tool_name_match": null,
+ "trajectory": {
+ "threshold": 1.0,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "node_order": null,
+ "response_match": {
+ "threshold": 0.8,
+ "match_type": "EXACT",
+ "judge_model": "gemini-2.5-flash",
+ "num_samples": 3,
+ "rubrics": [],
+ "keywords": [],
+ "check_args": false,
+ "enabled": true,
+ "api_style": "responses"
+ },
+ "rouge_match": null,
+ "contains_keywords": null,
+ "llm_judge": null,
+ "rubric_based": null,
+ "factual_accuracy": null,
+ "hallucination": null,
+ "safety": null,
+ "simulation_goals": null
+ },
+ "user_simulator_config": null,
+ "parallel": false,
+ "max_concurrency": 4,
+ "timeout": 300.0,
+ "verbose": false,
+ "mock_mode": false,
+ "reporter": {
+ "enabled": true,
+ "output_dir": "eval_reports",
+ "console": true,
+ "json_report": true,
+ "html": true,
+ "junit_xml": false,
+ "verbose": true,
+ "include_details": true,
+ "include_trajectory": true,
+ "include_node_responses": true,
+ "include_actual_response": true,
+ "include_tool_call_details": true,
+ "timestamp_files": true
+ }
+ },
+ "timestamp": 1781115140.0770233,
+ "metadata": {}
+}
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 083dd63..76a4b0b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "10xscale-agentflow"
-version = "0.7.5.0"
+version = "0.7.5.1"
description = "Production-grade Python framework for building, orchestrating, and deploying multi-agent LLM systems. A simpler, batteries-included alternative to LangGraph, CrewAI, and AutoGen with graph-based workflows, durable state, native MCP support, and provider-agnostic LLM integration (OpenAI, Google GenAI, Anthropic)."
readme = "README.md"
keywords = [
diff --git a/tests/adapters/test_openai_converter.py b/tests/adapters/test_openai_converter.py
index eecdd8a..a5d7ac4 100644
--- a/tests/adapters/test_openai_converter.py
+++ b/tests/adapters/test_openai_converter.py
@@ -438,3 +438,175 @@ async def test_no_token_details_defaults_to_zero(self):
assert msg.usages.reasoning_tokens == 0
assert msg.usages.cache_read_input_tokens == 0
assert msg.usages.cache_creation_input_tokens == 0
+
+
+@pytest.fixture
+def converter():
+ return OpenAIConverter()
+
+
+@pytest.mark.asyncio
+async def test_openai_converter_import_errors(monkeypatch):
+ monkeypatch.setattr("agentflow.runtime.adapters.llm.openai_converter.HAS_OPENAI", False)
+ converter = OpenAIConverter()
+
+ with pytest.raises(ImportError, match="openai is not installed"):
+ await converter.convert_response(Mock())
+
+ with pytest.raises(ImportError, match="openai is not installed"):
+ async for _ in converter.convert_streaming_response({}, "node", Mock()):
+ pass
+
+
+def test_extract_audio_block_exceptions(converter):
+ assert converter._extract_audio_block({"transcript": "hi"}) is None
+ assert converter._extract_audio_block(object()) is None
+
+
+def test_extract_image_blocks_various_types(converter):
+ img = "https://example.com/single.png"
+ blocks = converter._extract_image_blocks(img)
+ assert len(blocks) == 1
+ assert blocks[0].media.url == img
+
+ with patch("agentflow.runtime.adapters.llm.openai_converter.isinstance", side_effect=TypeError("mock error")):
+ blocks2 = converter._extract_image_blocks("url")
+ assert blocks2 == []
+
+
+@pytest.mark.asyncio
+async def test_streaming_conversion(converter):
+ class MockChunk:
+ def __init__(self, id, content=None, reasoning=None, tool_calls=None):
+ self.id = id
+ self.model = "gpt-4o"
+
+ delta_obj = type("Delta", (), {
+ "content": content,
+ "reasoning_content": reasoning,
+ "tool_calls": tool_calls,
+ "audio": None,
+ "images": None
+ })
+ choice = type("Choice", (), {
+ "delta": delta_obj
+ })
+ self.choices = [choice]
+
+ chunk1 = MockChunk("chat-1", reasoning="Thinking...")
+ chunk2 = MockChunk("chat-1", content="Hello ")
+ chunk3 = MockChunk("chat-1", content="world!")
+
+ tool_call_mock = type("ToolCall", (), {
+ "id": "tc-123",
+ "type": "function",
+ "function": type("Func", (), {
+ "name": "calc",
+ "arguments": '{"x": 1}'
+ })
+ })
+ chunk4 = MockChunk("chat-1", tool_calls=[tool_call_mock])
+
+ async def mock_async_stream():
+ yield chunk1
+ yield chunk2
+ yield chunk3
+ yield chunk4
+
+ messages = []
+ async for msg in converter.convert_streaming_response({}, "my_node", mock_async_stream()):
+ messages.append(msg)
+
+ assert len(messages) == 5
+ assert messages[0].reasoning == "Thinking..."
+ assert messages[1].content[0].text == "Hello "
+ assert messages[2].content[0].text == "world!"
+ assert messages[3].tools_calls[0]["id"] == "tc-123"
+
+ final_msg = messages[-1]
+ assert final_msg.delta is False
+ assert final_msg.reasoning == "Thinking..."
+ assert final_msg.content[0].text == "Hello world!"
+ assert final_msg.content[1].summary == "Thinking..."
+ assert final_msg.content[2].name == "calc"
+
+
+@pytest.mark.asyncio
+async def test_streaming_inline_think_thought(converter):
+ class MockChunk:
+ def __init__(self, id, content):
+ self.id = id
+ delta_obj = type("Delta", (), {
+ "content": content,
+ "reasoning_content": None,
+ "tool_calls": None,
+ "audio": None,
+ "images": None
+ })
+ self.choices = [type("Choice", (), {"delta": delta_obj})]
+
+ async def mock_stream():
+ yield MockChunk("chat-2", "Inline thoughtsActual content")
+
+ messages = []
+ async for msg in converter.convert_streaming_response({}, "node", mock_stream()):
+ messages.append(msg)
+
+ final_msg = messages[-1]
+ assert final_msg.reasoning == "Inline thoughts"
+ assert final_msg.content[0].text == "Actual content"
+
+
+@pytest.mark.asyncio
+async def test_streaming_sync_iterator(converter):
+ class MockChunk:
+ def __init__(self, id, content):
+ self.id = id
+ delta_obj = type("Delta", (), {
+ "content": content,
+ "reasoning_content": None,
+ "tool_calls": None,
+ "audio": None,
+ "images": None
+ })
+ self.choices = [type("Choice", (), {"delta": delta_obj})]
+
+ class SyncStream:
+ def __init__(self, chunks):
+ self.chunks = chunks
+ def __iter__(self):
+ return iter(self.chunks)
+
+ stream = SyncStream([MockChunk("chat-3", "Sync chunk")])
+ messages = []
+ async for msg in converter.convert_streaming_response({}, "node", stream):
+ messages.append(msg)
+
+ assert len(messages) == 2
+ assert messages[0].content[0].text == "Sync chunk"
+
+
+@pytest.mark.asyncio
+async def test_convert_streaming_response_chat_completion(converter):
+ response_data = {
+ "id": "chatcmpl-123",
+ "model": "gpt-4o",
+ "choices": [{"message": {"role": "assistant", "content": "Hello"}}],
+ "usage": {}
+ }
+ response = MockModelResponse(response_data)
+ messages = []
+ with patch("agentflow.runtime.adapters.llm.openai_converter.ChatCompletion", MockModelResponse):
+ async for msg in converter.convert_streaming_response({}, "node", response):
+ messages.append(msg)
+
+ assert len(messages) == 1
+ assert messages[0].content[0].text == "Hello"
+
+
+@pytest.mark.asyncio
+async def test_convert_streaming_response_unsupported(converter):
+ with pytest.raises(Exception, match="Unsupported response type"):
+ async for _ in converter.convert_streaming_response({}, "node", object()):
+ pass
+
diff --git a/tests/checkpointer/test_base_checkpointer.py b/tests/checkpointer/test_base_checkpointer.py
new file mode 100644
index 0000000..e4cdcfe
--- /dev/null
+++ b/tests/checkpointer/test_base_checkpointer.py
@@ -0,0 +1,129 @@
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+from agentflow.storage.checkpointer.base_checkpointer import BaseCheckpointer
+from agentflow.core.state import AgentState, Message
+from agentflow.utils.thread_info import ThreadInfo
+
+
+class DummyCheckpointer(BaseCheckpointer):
+ async def asetup(self): pass
+ async def aput_state(self, config, state): pass
+ async def aget_state(self, config): pass
+ async def aclear_state(self, config): pass
+ async def aput_state_cache(self, config, state): pass
+ async def aget_state_cache(self, config): pass
+ async def aput_messages(self, config, messages, metadata=None): pass
+ async def aget_message(self, config, message_id): pass
+ async def alist_messages(self, config, search=None, offset=None, limit=None): pass
+ async def adelete_message(self, config, message_id): pass
+ async def aput_thread(self, config, thread_info): pass
+ async def aget_thread(self, config): pass
+ async def alist_threads(self, config, search=None, offset=None, limit=None): pass
+ async def aclean_thread(self, config): pass
+ async def arelease(self): pass
+
+
+class MinimalCheckpointer(BaseCheckpointer):
+ async def asetup(self): pass
+ async def aput_state(self, config, state): pass
+ async def aget_state(self, config): pass
+ async def aclear_state(self, config): pass
+ async def aput_state_cache(self, config, state): pass
+ async def aget_state_cache(self, config): pass
+ async def aput_messages(self, config, messages, metadata=None): pass
+ async def aget_message(self, config, message_id): pass
+ async def alist_messages(self, config, search=None, offset=None, limit=None): pass
+ async def adelete_message(self, config, message_id): pass
+ async def aput_thread(self, config, thread_info): pass
+ async def aget_thread(self, config): pass
+ async def alist_threads(self, config, search=None, offset=None, limit=None): pass
+ async def aclean_thread(self, config): pass
+ async def arelease(self): pass
+
+
+def test_base_checkpointer_sync_wrappers():
+ cp = DummyCheckpointer()
+ cp.asetup = AsyncMock()
+ cp.aput_state = AsyncMock()
+ cp.aget_state = AsyncMock()
+ cp.aclear_state = AsyncMock()
+ cp.aput_state_cache = AsyncMock()
+ cp.aget_state_cache = AsyncMock()
+ cp.aput_messages = AsyncMock()
+ cp.aget_message = AsyncMock()
+ cp.alist_messages = AsyncMock()
+ cp.adelete_message = AsyncMock()
+ cp.aput_thread = AsyncMock()
+ cp.aget_thread = AsyncMock()
+ cp.alist_threads = AsyncMock()
+ cp.aclean_thread = AsyncMock()
+ cp.arelease = AsyncMock()
+
+ config = {"thread_id": "test_thread"}
+ state = MagicMock(spec=AgentState)
+ messages = [MagicMock(spec=Message)]
+ thread_info = MagicMock(spec=ThreadInfo)
+
+ # Test setup
+ cp.setup()
+ cp.asetup.assert_called_once()
+
+ # Test state methods
+ cp.put_state(config, state)
+ cp.aput_state.assert_called_once_with(config, state)
+
+ cp.get_state(config)
+ cp.aget_state.assert_called_once_with(config)
+
+ cp.clear_state(config)
+ cp.aclear_state.assert_called_once_with(config)
+
+ cp.put_state_cache(config, state)
+ cp.aput_state_cache.assert_called_once_with(config, state)
+
+ cp.get_state_cache(config)
+ cp.aget_state_cache.assert_called_once_with(config)
+
+ # Test cache values
+ cp.put_cache_value("ns", "k", "v", ttl_seconds=10)
+ cp.get_cache_value("ns", "k")
+ cp.clear_cache_value("ns", "k")
+
+ # Test message methods
+ cp.put_messages(config, messages, metadata={"meta": 1})
+ cp.aput_messages.assert_called_once_with(config, messages, {"meta": 1})
+
+ cp.get_message(config, "msg1")
+ cp.aget_message.assert_called_once_with(config, "msg1")
+
+ cp.list_messages(config, search="x", offset=1, limit=5)
+ cp.alist_messages.assert_called_once_with(config, "x", 1, 5)
+
+ cp.delete_message(config, "msg1")
+ cp.adelete_message.assert_called_once_with(config, "msg1")
+
+ # Test thread methods
+ cp.put_thread(config, thread_info)
+ cp.aput_thread.assert_called_once_with(config, thread_info)
+
+ cp.get_thread(config)
+ cp.aget_thread.assert_called_once_with(config)
+
+ cp.list_threads(config, search="x", offset=1, limit=5)
+ cp.alist_threads.assert_called_once_with(config, "x", 1, 5)
+
+ cp.clean_thread(config)
+ cp.aclean_thread.assert_called_once_with(config)
+
+ # Test release
+ cp.release()
+ cp.arelease.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_base_checkpointer_default_cache_methods():
+ cp = MinimalCheckpointer()
+ assert await cp.aput_cache_value("ns", "k", "v") is None
+ assert await cp.aget_cache_value("ns", "k") is None
+ assert await cp.aclear_cache_value("ns", "k") is None
+ assert await cp.alist_cache_keys("ns") == []
diff --git a/tests/checkpointer/test_pg_checkpointer_extra.py b/tests/checkpointer/test_pg_checkpointer_extra.py
index aa95370..c094cdb 100644
--- a/tests/checkpointer/test_pg_checkpointer_extra.py
+++ b/tests/checkpointer/test_pg_checkpointer_extra.py
@@ -1,5 +1,6 @@
import json
-from unittest.mock import AsyncMock, MagicMock
+from enum import Enum
+from unittest.mock import AsyncMock, MagicMock, patch
import pytest
@@ -195,3 +196,162 @@ async def test_aget_message_not_found_raises(cp):
with pytest.raises(ValueError):
await cp.aget_message({"thread_id": "t1"}, "missing")
+
+
+def test_import_error_asyncpg(monkeypatch):
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_ASYNCPG", False)
+ with pytest.raises(ImportError) as exc:
+ PgCheckpointer(postgres_dsn="postgres://x")
+ assert "requires 'asyncpg'" in str(exc.value)
+
+
+def test_import_error_redis(monkeypatch):
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_ASYNCPG", True)
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_REDIS", False)
+ with pytest.raises(ImportError) as exc:
+ PgCheckpointer(postgres_dsn="postgres://x")
+ assert "requires 'redis'" in str(exc.value)
+
+
+def test_schema_name_validation_on_init(monkeypatch):
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_ASYNCPG", True)
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_REDIS", True)
+ with pytest.raises(ValueError):
+ PgCheckpointer(postgres_dsn="postgres://x", redis=MagicMock(), schema="invalid-schema-name")
+
+
+def test_init_with_pools(monkeypatch):
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_ASYNCPG", True)
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_REDIS", True)
+
+ mock_pg_pool = MagicMock()
+ mock_redis_pool = MagicMock()
+
+ with patch("agentflow.storage.checkpointer.pg_checkpointer.Redis") as mock_redis_class:
+ cp = PgCheckpointer(pg_pool=mock_pg_pool, redis_pool=mock_redis_pool)
+ assert cp._pg_pool is mock_pg_pool
+ mock_redis_class.assert_called_once_with(connection_pool=mock_redis_pool)
+
+
+def test_create_redis_pool_no_url(monkeypatch):
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_ASYNCPG", True)
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_REDIS", True)
+
+ cp = PgCheckpointer(postgres_dsn="postgres://x", redis=MagicMock())
+ with pytest.raises(ValueError):
+ cp._create_redis_pool(redis=None, redis_pool=None, redis_url=None, redis_pool_config={})
+
+
+def test_create_pg_pool(cp):
+ mock_pool = MagicMock()
+ assert cp._create_pg_pool(pg_pool=mock_pool, postgres_dsn=None, pool_config={}) is mock_pool
+
+ with patch("asyncpg.create_pool") as mock_create_pool:
+ cp._create_pg_pool(pg_pool=None, postgres_dsn="postgres://url", pool_config={"min_size": 5})
+ mock_create_pool.assert_called_once_with(dsn="postgres://url", min_size=5)
+
+
+@pytest.mark.asyncio
+async def test_get_pg_pool_lazy(monkeypatch):
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_ASYNCPG", True)
+ monkeypatch.setattr("agentflow.storage.checkpointer.pg_checkpointer.HAS_REDIS", True)
+
+ cp = PgCheckpointer(postgres_dsn="postgres://dsn", redis=MagicMock())
+ assert cp._pg_pool is None
+
+ mock_pool = MagicMock()
+ async def mock_create_pool(*args, **kwargs):
+ return mock_pool
+
+ monkeypatch.setattr(cp, "_create_pg_pool", mock_create_pool)
+ pool = await cp._get_pg_pool()
+ assert pool is mock_pool
+ assert cp._pg_pool is mock_pool
+
+
+def test_json_serializer_fast_json_importers(monkeypatch, cp):
+ monkeypatch.setenv("FAST_JSON", "1")
+
+ import builtins
+ real_import = builtins.__import__
+ def mock_import(name, *args, **kwargs):
+ if name in ("orjson", "msgspec"):
+ raise ImportError
+ return real_import(name, *args, **kwargs)
+
+ with patch("builtins.__import__", side_effect=mock_import):
+ serializer = cp._get_json_serializer()
+ assert serializer == json.dumps
+
+
+@pytest.mark.asyncio
+async def test_check_and_apply_schema_version_upgrade(cp):
+ conn = AsyncMock()
+ conn.fetchrow = AsyncMock(return_value={"version": 1})
+
+ with patch.object(cp, "_get_current_schema_version", return_value=2):
+ await cp._check_and_apply_schema_version(conn)
+
+ conn.execute.assert_called_once_with(
+ 'INSERT INTO "public"."schema_version" (version) VALUES ($1)', 2
+ )
+
+
+@pytest.mark.asyncio
+async def test_check_and_apply_schema_version_exception(cp):
+ conn = AsyncMock()
+ conn.fetchrow = AsyncMock(side_effect=RuntimeError("db error"))
+
+ await cp._check_and_apply_schema_version(conn)
+ conn.execute.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_initialize_schema_early_return(cp):
+ cp._schema_initialized = True
+ await cp._initialize_schema()
+ assert cp._pg_pool is None or not cp._pg_pool.acquire.called
+
+
+@pytest.mark.asyncio
+async def test_initialize_schema_error(cp):
+ cp._schema_initialized = False
+
+ conn = AsyncMock()
+ conn.execute = AsyncMock(side_effect=RuntimeError("sql error"))
+
+ cp._pg_pool = MagicMock()
+ cp._pg_pool.acquire.return_value = _AcquireCtx(conn)
+
+ with pytest.raises(RuntimeError):
+ await cp._initialize_schema()
+
+
+def test_serialize_state_enum_handler(cp):
+ class MockEnum(Enum):
+ VAL = "enum_val"
+
+ class MockObj:
+ def __str__(self):
+ return "str_obj"
+
+ mock_state = MagicMock()
+ mock_state.model_dump.return_value = {"enum": MockEnum.VAL, "obj": MockObj()}
+ serialized = cp._serialize_state(mock_state)
+ loaded = json.loads(serialized)
+ assert loaded["enum"] == "enum_val"
+ assert loaded["obj"] == "str_obj"
+
+
+def test_deserialize_state_errors(cp):
+ class BadState:
+ @classmethod
+ def model_validate(cls, d):
+ raise TypeError("validation error")
+
+ with pytest.raises(TypeError):
+ cp._deserialize_state({"invalid": "data"}, BadState)
+
+ with pytest.raises(json.JSONDecodeError):
+ cp._deserialize_state("invalid-str", BadState)
+
diff --git a/tests/evaluation/test_phase3_evaluator.py b/tests/evaluation/test_phase3_evaluator.py
index ea65c55..8e36e31 100644
--- a/tests/evaluation/test_phase3_evaluator.py
+++ b/tests/evaluation/test_phase3_evaluator.py
@@ -482,3 +482,305 @@ def test_render_error_case(self):
html = reporter._render_case(result)
assert "error" in html
assert "Something went wrong" in html
+
+
+# ============================================================================
+# Additional Reporter Tests for Coverage
+# ============================================================================
+
+from agentflow.qa.evaluation.dataset.eval_set import ToolCall, TrajectoryStep, StepType
+from agentflow.qa.evaluation.eval_result import NodeDetail
+from agentflow.qa.evaluation.token_usage import TokenUsage
+
+class NodeResponseObj:
+ node_name = "object_node"
+ response_text = "hello from object node"
+ tool_call_names = ["other_tool"]
+ is_final = False
+ has_tool_calls = False
+ timestamp = 140.0
+ input_messages = [{"role": "system", "content": "system_prompt"}]
+
+def build_comprehensive_report():
+ # 1. Tool Call
+ tc = ToolCall(
+ name="test_tool",
+ args={"arg1": "val1"},
+ call_id="call_123",
+ result="success_result"
+ )
+
+ # 2. Trajectory Steps
+ step1 = TrajectoryStep(
+ step_type=StepType.TOOL,
+ name="test_tool",
+ args={"arg1": "val1"},
+ timestamp=100.0,
+ metadata={"meta1": "val1"}
+ )
+
+ # 3. Node Response (dict format)
+ nr_dict = {
+ "node_name": "agent_node",
+ "response_text": "hello from node",
+ "tool_call_names": ["test_tool"],
+ "is_final": True,
+ "has_tool_calls": True,
+ "timestamp": 120.0,
+ "input_messages": [{"role": "user", "content": "hello"}]
+ }
+
+ # 4. Node Detail (object format for node_details)
+ node_detail = NodeDetail(
+ node_name="other_node",
+ input_messages=[{"role": "user", "content": "hi"}],
+ response_text="hi response",
+ token_usage=TokenUsage(input_tokens=10, output_tokens=5),
+ timestamp=130.0
+ )
+
+ # 5. Criterion Results
+ cr1 = CriterionResult(
+ criterion="traj_crit",
+ score=0.4,
+ passed=False,
+ threshold=0.8,
+ details={"reason": "traj failed reason", "extra": "extra_detail"},
+ error="eval error message"
+ )
+ cr2 = CriterionResult.success(
+ criterion="resp_crit",
+ score=0.9,
+ threshold=0.8,
+ details={"reason": "resp passed reason"}
+ )
+
+ # 6. Case Results
+ # Result 1: Failed
+ r1 = EvalCaseResult.success(
+ eval_id="case1",
+ name="Case One",
+ criterion_results=[cr1, cr2],
+ actual_trajectory=[step1],
+ actual_tool_calls=[tc],
+ actual_response="Hello world",
+ messages=[{"role": "user", "content": "query"}, {"role": "assistant", "content": "Hello world"}],
+ node_responses=[],
+ node_visits=["start_node", "agent_node"],
+ duration_seconds=2.5,
+ metadata={"case_meta": "meta_val"},
+ turn_results=[{
+ "turn_index": 0,
+ "user_input": "query",
+ "agent_response": "Hello world",
+ "tool_calls": [{"name": "test_tool"}],
+ "node_visits": ["start_node", "agent_node"]
+ }],
+ node_details=[node_detail]
+ )
+
+ # Assign attributes that bypass basic pydantic constructor validation
+ r1.actual_trajectory = [step1, "simple_trajectory_step"]
+ r1.node_responses = [nr_dict, NodeResponseObj()]
+
+ # Result 2: Error
+ r2 = EvalCaseResult.failure(
+ eval_id="case2",
+ error="Case execution crash",
+ name="Case Two",
+ duration_seconds=1.2
+ )
+
+ # 7. Create report
+ report = EvalReport.create(
+ eval_set_id="test_set_123",
+ eval_set_name="Comprehensive Test Set",
+ results=[r1, r2],
+ config_used={"eval_param": "value"}
+ )
+ report.metadata = {"report_meta": "val"}
+ return report
+
+def test_colors_disable():
+ # Save all original attributes from Colors
+ orig_attrs = {k: getattr(Colors, k) for k in ["RED", "RESET", "BOLD", "DIM", "GREEN", "YELLOW", "BLUE", "MAGENTA", "CYAN", "WHITE", "BG_RED", "BG_GREEN"]}
+ try:
+ Colors.disable()
+ assert Colors.RED == ""
+ finally:
+ for k, v in orig_attrs.items():
+ setattr(Colors, k, v)
+
+def test_json_reporter_quick_save():
+ report = build_comprehensive_report()
+ with tempfile.TemporaryDirectory() as tmpdir:
+ path = Path(tmpdir) / "quick.json"
+ JSONReporter.quick_save(report, str(path))
+ assert path.exists()
+
+def test_json_reporter_generate():
+ report = build_comprehensive_report()
+ reporter = JSONReporter()
+
+ # 1. output_dir = None -> returns JSON string
+ json_str = reporter.generate(report)
+ assert "Comprehensive Test Set" in json_str
+
+ # 2. output_dir is provided -> saves to file and returns path
+ with tempfile.TemporaryDirectory() as tmpdir:
+ res_path = reporter.generate(report, output_dir=tmpdir)
+ assert Path(res_path).exists()
+ assert res_path.endswith("report.json")
+
+def test_json_reporter_exclusions():
+ report = build_comprehensive_report()
+
+ # Disable everything
+ reporter = JSONReporter(
+ include_details=False,
+ include_trajectory=False,
+ include_node_responses=False,
+ include_actual_response=False,
+ include_tool_call_details=False,
+ )
+ data = reporter.to_dict(report)
+
+ for result in data["results"]:
+ assert "actual_trajectory" not in result
+ assert "actual_tool_calls" not in result
+ assert "node_responses" not in result
+ assert "node_details" not in result
+ assert "actual_response" not in result
+ for cr in result.get("criterion_results", []):
+ assert "details" not in cr
+
+def test_junit_reporter_generate():
+ report = build_comprehensive_report()
+ reporter = JUnitXMLReporter()
+
+ # 1. output_dir = None -> returns XML string
+ xml_str = reporter.generate(report)
+ assert "Comprehensive Test Set" in xml_str
+
+ # 2. output_dir is provided -> saves to file and returns path
+ with tempfile.TemporaryDirectory() as tmpdir:
+ res_path = reporter.generate(report, output_dir=tmpdir)
+ assert Path(res_path).exists()
+ assert res_path.endswith("junit.xml")
+
+def test_junit_reporter_details():
+ report = build_comprehensive_report()
+ reporter = JUnitXMLReporter()
+ xml_str = reporter.to_xml(report)
+
+ assert "config_used" in xml_str
+ assert "report_meta" in xml_str
+ assert '' in xml_str
+ assert ' 127 for c in text):
+ raise UnicodeEncodeError("ascii", text, 0, len(text), "non-ascii")
+ return len(text)
+
+ mock_output.write.side_effect = write_side_effect
+
+ reporter = ConsoleReporter(use_color=False, output=mock_output)
+ reporter.report(report)
+
+def test_console_reporter_partial_stats(capsys):
+ cr_passed = CriterionResult.success(
+ criterion="yellow_crit",
+ score=0.9,
+ threshold=0.8,
+ )
+ cr_failed = CriterionResult.success(
+ criterion="yellow_crit",
+ score=0.5,
+ threshold=0.8,
+ )
+ r1 = EvalCaseResult.success(
+ eval_id="case1",
+ criterion_results=[cr_passed],
+ )
+ r2 = EvalCaseResult.success(
+ eval_id="case2",
+ criterion_results=[cr_failed],
+ )
+
+ report = EvalReport.create(
+ eval_set_id="partial_set",
+ results=[r1, r2],
+ )
+ report.summary.pass_rate = 0.5
+ report.summary.criterion_stats = {
+ "yellow_crit": {
+ "pass_rate": 0.7,
+ "avg_score": 0.75,
+ "passed": 7,
+ "total": 10,
+ },
+ "red_crit": {
+ "pass_rate": 0.3,
+ "avg_score": 0.35,
+ "passed": 3,
+ "total": 10,
+ }
+ }
+
+ reporter = ConsoleReporter(use_color=False, verbose=True)
+ reporter.report(report)
+
+ captured = capsys.readouterr()
+ assert "PARTIAL" in captured.out
+ assert "yellow_crit" in captured.out
+ assert "red_crit" in captured.out
+
+def test_console_reporter_comprehensive_printing(capsys):
+ report = build_comprehensive_report()
+
+ reporter = ConsoleReporter(
+ use_color=True,
+ verbose=True,
+ include_trajectory=True,
+ include_actual_response=True,
+ )
+ reporter.report(report)
+
+ captured = capsys.readouterr()
+ assert "Comprehensive Test Set" in captured.out
+ assert "Case One" in captured.out
+ assert "Case Two" in captured.out
+ assert "ERROR" in captured.out
+ assert "Case execution crash" in captured.out
+ assert "case_meta" in captured.out
+ assert "test_tool" in captured.out
+ assert "call_id" in captured.out
+ assert "simple_trajectory_step" in captured.out
+ assert "object_node" in captured.out
+ assert "hello from object node" in captured.out
+ assert "Turn 0:" in captured.out
+ assert "traj failed reason" in captured.out
+ assert "extra_detail" in captured.out
diff --git a/tests/evaluation/test_testing_fixtures.py b/tests/evaluation/test_testing_fixtures.py
new file mode 100644
index 0000000..d1b7de9
--- /dev/null
+++ b/tests/evaluation/test_testing_fixtures.py
@@ -0,0 +1,182 @@
+"""Tests for Pytest integration utilities in testing.py."""
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+from typing import Any
+from pathlib import Path
+
+from agentflow.qa.evaluation.testing import (
+ EvalTestCase,
+ eval_test,
+ assert_eval_passed,
+ assert_criterion_passed,
+ parametrize_eval_cases,
+ EvalFixtures,
+ EvalPlugin,
+ run_eval,
+ create_eval_app,
+ create_simple_eval_set,
+)
+from agentflow.qa.evaluation.eval_result import EvalReport
+from agentflow.core.graph.compiled_graph import CompiledGraph
+
+def test_eval_test_case_repr():
+ case = EvalTestCase(eval_id="id1", name="name1", description="desc1")
+ assert case.eval_id == "id1"
+ assert case.name == "name1"
+ assert case.description == "desc1"
+ assert repr(case) == "EvalTestCase(name1)"
+
+ case2 = EvalTestCase(eval_id="id2")
+ assert repr(case2) == "EvalTestCase(id2)"
+
+@pytest.mark.asyncio
+async def test_eval_test_decorator_success():
+ mock_report = MagicMock()
+ mock_report.summary.pass_rate = 1.0
+
+ mock_evaluator = MagicMock()
+ mock_evaluator.evaluate = AsyncMock(return_value=mock_report)
+
+ with patch("agentflow.qa.evaluation.AgentEvaluator", return_value=mock_evaluator) as mock_class:
+ with patch("agentflow.qa.evaluation.testing.Path.exists", return_value=True):
+ @eval_test(eval_file="dummy.json", threshold=0.8)
+ async def my_test():
+ return "graph", "collector"
+
+ await my_test()
+ mock_class.assert_called_once()
+ mock_evaluator.evaluate.assert_called_once_with("dummy.json", verbose=True)
+
+@pytest.mark.asyncio
+async def test_eval_test_decorator_skips():
+ import _pytest.outcomes
+
+ @eval_test(eval_file="dummy.json")
+ async def my_skip_test():
+ return None
+
+ with pytest.raises(_pytest.outcomes.Skipped):
+ await my_skip_test()
+
+@pytest.mark.asyncio
+async def test_eval_test_decorator_fails_invalid_return():
+ import _pytest.outcomes
+
+ @eval_test(eval_file="dummy.json")
+ async def my_fail_test():
+ return "not-a-tuple"
+
+ with pytest.raises(_pytest.outcomes.Failed):
+ await my_fail_test()
+
+@pytest.mark.asyncio
+async def test_eval_test_decorator_fails_threshold_not_met():
+ import _pytest.outcomes
+
+ mock_report = MagicMock()
+ mock_report.summary.pass_rate = 0.5
+ mock_report.failed_cases = [
+ MagicMock(eval_id="case_1", name="Case One", error="failed", failed_criteria=[])
+ ]
+
+ mock_evaluator = MagicMock()
+ mock_evaluator.evaluate = AsyncMock(return_value=mock_report)
+
+ with patch("agentflow.qa.evaluation.AgentEvaluator", return_value=mock_evaluator):
+ with patch("agentflow.qa.evaluation.testing.Path.exists", return_value=True):
+ @eval_test(eval_file="dummy.json", threshold=0.9)
+ async def my_fail_threshold_test():
+ return "graph", "collector"
+
+ with pytest.raises(_pytest.outcomes.Failed):
+ await my_fail_threshold_test()
+
+@pytest.mark.asyncio
+async def test_eval_test_decorator_auto_detect_path():
+ mock_report = MagicMock()
+ mock_report.summary.pass_rate = 1.0
+ mock_evaluator = MagicMock()
+ mock_evaluator.evaluate = AsyncMock(return_value=mock_report)
+
+ with patch("agentflow.qa.evaluation.AgentEvaluator", return_value=mock_evaluator):
+ with patch("agentflow.qa.evaluation.testing.Path.exists", return_value=True):
+ @eval_test()
+ async def test_my_custom_scenario():
+ return "graph", "collector"
+
+ await test_my_custom_scenario()
+
+def test_assert_eval_passed():
+ report = MagicMock()
+ report.summary.pass_rate = 0.9
+ report.failed_cases = [MagicMock(eval_id="c1", name="", failed_criteria=[])]
+
+ assert_eval_passed(report, min_pass_rate=0.8)
+
+ with pytest.raises(AssertionError, match="Evaluation pass rate 90.0% below threshold 95.0%"):
+ assert_eval_passed(report, min_pass_rate=0.95)
+
+def test_assert_criterion_passed():
+ report = MagicMock()
+ report.summary.criterion_stats = {
+ "accuracy": {"avg_score": 0.85}
+ }
+
+ with pytest.raises(AssertionError, match="Criterion 'safety' not found"):
+ assert_criterion_passed(report, "safety")
+
+ assert_criterion_passed(report, "accuracy", min_score=0.8)
+
+ with pytest.raises(AssertionError, match="Criterion 'accuracy' average score 0.85 below minimum 0.90"):
+ assert_criterion_passed(report, "accuracy", min_score=0.9)
+
+def test_parametrize_eval_cases():
+ mock_set = MagicMock()
+ mock_case = MagicMock(eval_id="case1")
+ mock_set.eval_cases = [mock_case]
+
+ with patch("agentflow.qa.evaluation.dataset.eval_set.EvalSet.from_file", return_value=mock_set) as mock_from_file:
+ decorator = parametrize_eval_cases("dummy_path.json")
+ assert decorator is not None
+ mock_from_file.assert_called_once_with("dummy_path.json")
+
+def test_eval_fixtures():
+ fixtures = EvalFixtures(default_config="my_config")
+ assert fixtures.default_config == "my_config"
+
+ with patch("agentflow.qa.evaluation.AgentEvaluator") as mock_eval:
+ factory = fixtures.evaluator_factory()
+ factory("graph", "collector")
+ mock_eval.assert_called_once_with("graph", "collector", config="my_config")
+
+def test_eval_plugin_noop():
+ plugin = EvalPlugin()
+ plugin.pytest_configure(None)
+ plugin.pytest_collection_modifyitems(None, None)
+
+@pytest.mark.asyncio
+async def test_run_eval_success():
+ with patch("agentflow.qa.evaluation.AgentEvaluator") as mock_eval_class:
+ mock_eval = mock_eval_class.return_value
+ mock_eval.evaluate = AsyncMock(return_value="eval_report")
+
+ res = await run_eval("graph", "collector", "path.json")
+ assert res == "eval_report"
+ mock_eval.evaluate.assert_called_once_with("path.json", verbose=False)
+
+def test_create_eval_app():
+ mock_graph = MagicMock()
+ mock_graph.compile.return_value = "compiled_app"
+
+ app, collector = create_eval_app(mock_graph)
+ assert app == "compiled_app"
+ assert collector is not None
+ mock_graph.compile.assert_called_once()
+
+def test_create_simple_eval_set():
+ eval_set = create_simple_eval_set("my_set_id", [("query", "expected", "test_name")])
+ assert eval_set.eval_set_id == "my_set_id"
+ assert eval_set.name == "my_set_id"
+ assert len(eval_set.eval_cases) == 1
+ assert eval_set.eval_cases[0].name == "test_name"
diff --git a/tests/evaluation/test_user_simulator.py b/tests/evaluation/test_user_simulator.py
new file mode 100644
index 0000000..79bfe88
--- /dev/null
+++ b/tests/evaluation/test_user_simulator.py
@@ -0,0 +1,235 @@
+"""Tests for UserSimulator and BatchSimulator."""
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+from typing import Any
+import json
+import uuid
+
+from agentflow.qa.evaluation.simulators.user_simulator import (
+ UserSimulator,
+ BatchSimulator,
+ ConversationScenario,
+ SimulationResult
+)
+from agentflow.qa.evaluation.config.eval_config import UserSimulatorConfig
+from agentflow.qa.evaluation.criteria.base import BaseCriterion, CriterionResult
+from agentflow.core.graph.compiled_graph import CompiledGraph
+from agentflow.core.state import Message
+
+class MockCriterion(BaseCriterion):
+ """Mock criterion for evaluation."""
+ def __init__(self, name="mock_criterion"):
+ super().__init__()
+ self.name = name
+
+ async def evaluate(self, execution_result: Any, eval_case: Any) -> CriterionResult:
+ return CriterionResult(
+ criterion=self.name,
+ score=0.9,
+ passed=True,
+ details={"notes": "good response"}
+ )
+
+@pytest.mark.asyncio
+async def test_user_simulator_init():
+ # 1. Init with defaults
+ sim = UserSimulator()
+ assert sim.model == "gemini/gemini-2.5-flash"
+ assert sim.temperature == 0.7
+ assert sim.max_turns == 10
+ assert sim.api_style == "responses"
+
+ # 2. Init with config
+ config = UserSimulatorConfig(
+ model="gpt-4o",
+ temperature=0.5,
+ max_invocations=5
+ )
+ # Mocking config object to have api_style
+ config.api_style = "chat"
+ sim_config = UserSimulator(config=config)
+ assert sim_config.model == "gpt-4o"
+ assert sim_config.temperature == 0.5
+ assert sim_config.max_turns == 5
+ assert sim_config.api_style == "chat"
+
+@pytest.mark.asyncio
+async def test_user_simulator_run_success():
+ # Prepare scenario
+ scenario = ConversationScenario(
+ scenario_id="test_scen",
+ description="A test scenario",
+ starting_prompt="Hello agent",
+ goals=["Say hello back"],
+ max_turns=3
+ )
+
+ # Mock CompiledGraph
+ mock_graph = MagicMock(spec=CompiledGraph)
+ msg = Message.text_message("Hello back!", role="assistant")
+ mock_graph.ainvoke = AsyncMock(return_value={"messages": [msg]})
+
+ # Mock call_llm response for goal check
+ mock_result_json = json.dumps({"achieved": True, "reasoning": "Agent said hello back"})
+
+ criterion = MockCriterion("test_criterion")
+ sim = UserSimulator(criteria=[criterion])
+
+ with patch("agentflow.qa.evaluation.simulators.user_simulator.call_llm") as mock_call:
+ mock_call.return_value = (mock_result_json, 10, 20, 0)
+
+ result = await sim.run(mock_graph, scenario)
+
+ assert result.completed is True
+ assert "Say hello back" in result.goals_achieved
+ assert result.turns == 1
+ assert result.criterion_scores["test_criterion"] == 0.9
+
+@pytest.mark.asyncio
+async def test_user_simulator_run_no_starting_prompt():
+ scenario = ConversationScenario(
+ scenario_id="test_scen_no_start",
+ description="Test no start",
+ starting_prompt="",
+ goals=["Finish"],
+ max_turns=2
+ )
+
+ mock_graph = MagicMock(spec=CompiledGraph)
+ msg = Message.text_message("Got it", role="assistant")
+ mock_graph.ainvoke = AsyncMock(return_value={"messages": [msg]})
+
+ sim = UserSimulator()
+
+ with patch("agentflow.qa.evaluation.simulators.user_simulator.call_llm") as mock_call:
+ mock_call.side_effect = [
+ ("Start prompt", 5, 5, 0),
+ (json.dumps({"achieved": True, "reasoning": "Done"}), 10, 10, 0)
+ ]
+
+ result = await sim.run(mock_graph, scenario)
+ assert result.completed is True
+ assert result.conversation[0]["content"] == "Start prompt"
+
+@pytest.mark.asyncio
+async def test_user_simulator_run_max_turns():
+ scenario = ConversationScenario(
+ scenario_id="test_max_turns",
+ description="Test max turns limit",
+ starting_prompt="Hello",
+ goals=["Goal never met"],
+ max_turns=2
+ )
+
+ mock_graph = MagicMock(spec=CompiledGraph)
+ msg = Message.text_message("Agent reply", role="assistant")
+ mock_graph.ainvoke = AsyncMock(return_value={"messages": [msg]})
+
+ sim = UserSimulator()
+
+ with patch("agentflow.qa.evaluation.simulators.user_simulator.call_llm") as mock_call:
+ mock_call.side_effect = [
+ (json.dumps({"achieved": False, "reasoning": "Not yet"}), 5, 5, 0),
+ ("User follow-up", 5, 5, 0),
+ (json.dumps({"achieved": False, "reasoning": "Still not yet"}), 5, 5, 0),
+ ("User second follow-up", 5, 5, 0),
+ ]
+
+ result = await sim.run(mock_graph, scenario)
+ assert result.completed is False
+ assert result.turns == 2
+ assert len(result.conversation) == 4
+
+@pytest.mark.asyncio
+async def test_user_simulator_graph_exception():
+ scenario = ConversationScenario(
+ scenario_id="test_graph_err",
+ starting_prompt="Hello",
+ goals=["Done"]
+ )
+ mock_graph = MagicMock(spec=CompiledGraph)
+ mock_graph.ainvoke = AsyncMock(side_effect=RuntimeError("Graph crash"))
+
+ sim = UserSimulator()
+ result = await sim.run(mock_graph, scenario)
+ assert result.completed is False
+ assert "Graph crash" in result.error
+
+@pytest.mark.asyncio
+async def test_user_simulator_check_goals_fallback():
+ sim = UserSimulator()
+ conversation = [
+ {"role": "user", "content": "hello"},
+ {"role": "assistant", "content": "the magic word is banana"}
+ ]
+
+ with patch("agentflow.qa.evaluation.simulators.user_simulator.call_llm", side_effect=Exception("LLM down")):
+ achieved, usage = await sim._check_goals(
+ all_goals=["banana", "apple"],
+ achieved=[],
+ conversation=conversation
+ )
+ assert "banana" in achieved
+ assert "apple" not in achieved
+
+@pytest.mark.asyncio
+async def test_user_simulator_general_exception():
+ scenario = ConversationScenario(scenario_id="crash")
+ sim = UserSimulator()
+ mock_graph = MagicMock(spec=CompiledGraph)
+ mock_graph.ainvoke = AsyncMock(return_value={"messages": []})
+
+ # Cause crash inside try block by raising TypeError in _generate_initial_message
+ with patch.object(sim, "_generate_initial_message", side_effect=TypeError("Crashed inside")):
+ result = await sim.run(mock_graph, scenario)
+ assert result.completed is False
+ assert "Crashed inside" in result.error
+
+def test_extract_response_fallback_paths():
+ sim = UserSimulator()
+ # 1. empty result
+ assert sim._extract_response({}) == ""
+ # 2. messages is empty
+ assert sim._extract_response({"messages": []}) == ""
+ # 3. assistant content block list
+ msg1 = type("Message", (), {
+ "role": "assistant",
+ "content": [
+ type("Block", (), {"text": "Hello"}),
+ type("Block", (), {})
+ ]
+ })
+ assert sim._extract_response({"messages": [msg1]}) == "Hello"
+
+ # 4. plain dict format
+ msg2 = {"role": "assistant", "content": "Dict text"}
+ assert sim._extract_response({"messages": [msg2]}) == "Dict text"
+
+@pytest.mark.asyncio
+async def test_batch_simulator():
+ sim = UserSimulator()
+ batch = BatchSimulator(simulator=sim, max_concurrency=2)
+
+ mock_graph = MagicMock(spec=CompiledGraph)
+ msg = Message.text_message("Done", role="assistant")
+ mock_graph.ainvoke = AsyncMock(return_value={"messages": [msg]})
+
+ scenarios = [
+ ConversationScenario(scenario_id="s1", starting_prompt="p1", goals=["g1"]),
+ ConversationScenario(scenario_id="s2", starting_prompt="p2", goals=["g2"])
+ ]
+
+ with patch("agentflow.qa.evaluation.simulators.user_simulator.call_llm") as mock_call:
+ mock_call.return_value = (json.dumps({"achieved": True, "reasoning": "ok"}), 1, 1, 0)
+
+ results = await batch.run_batch(mock_graph, scenarios)
+ assert len(results) == 2
+ assert results[0].scenario_id == "s1"
+ assert results[1].scenario_id == "s2"
+
+ summary = batch.summary(results)
+ assert summary["total_scenarios"] == 2
+ assert summary["completed"] == 2
+ assert summary["completion_rate"] == 1.0
+ assert summary["errors"] == 0
diff --git a/tests/graph/test_agent_internal.py b/tests/graph/test_agent_internal.py
index 4793423..ca4e4e8 100644
--- a/tests/graph/test_agent_internal.py
+++ b/tests/graph/test_agent_internal.py
@@ -30,6 +30,18 @@
)
from agentflow.storage.store.store_schema import MemorySearchResult, MemoryType
+from agentflow.core.graph.agent_internal.execution import (
+ _extract_cache_creation_tokens,
+ _extract_cache_read_tokens,
+ _extract_finish_reason,
+ _extract_input_tokens,
+ _extract_output_tokens,
+ _extract_reasoning_tokens,
+ _extract_response_id,
+ _extract_response_model,
+ _extract_response_text,
+)
+
# ─────────────────────────────────────────────────────────────────────────────
# Shared helpers
@@ -363,6 +375,210 @@ def test_returns_empty_string_for_empty_content(self):
assert agent._extract_prompt(messages) == ""
+# ═════════════════════════════════════════════════════════════════════════════
+# Token / response extraction helpers
+# ═════════════════════════════════════════════════════════════════════════════
+
+
+class TestExtractInputTokens:
+ def test_no_usage_returns_zero(self):
+ assert _extract_input_tokens(SimpleNamespace()) == 0
+
+ def test_prompt_tokens(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(prompt_tokens=42))
+ assert _extract_input_tokens(resp) == 42
+
+ def test_input_tokens(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(input_tokens=99))
+ assert _extract_input_tokens(resp) == 99
+
+ def test_prompt_tokens_preferred_over_input_tokens(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(prompt_tokens=10, input_tokens=20))
+ assert _extract_input_tokens(resp) == 10
+
+
+class TestExtractOutputTokens:
+ def test_no_usage_returns_zero(self):
+ assert _extract_output_tokens(SimpleNamespace()) == 0
+
+ def test_completion_tokens(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(completion_tokens=42))
+ assert _extract_output_tokens(resp) == 42
+
+ def test_output_tokens(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(output_tokens=99))
+ assert _extract_output_tokens(resp) == 99
+
+ def test_completion_tokens_preferred_over_output_tokens(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(completion_tokens=10, output_tokens=20))
+ assert _extract_output_tokens(resp) == 10
+
+
+class TestExtractCacheReadTokens:
+ def test_no_usage_returns_zero(self):
+ assert _extract_cache_read_tokens(SimpleNamespace()) == 0
+
+ def test_anthropic_cache_read(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(cache_read_input_tokens=100))
+ assert _extract_cache_read_tokens(resp) == 100
+
+ def test_openai_chat_cached_tokens(self):
+ resp = SimpleNamespace(
+ usage=SimpleNamespace(prompt_tokens_details=SimpleNamespace(cached_tokens=50)),
+ )
+ assert _extract_cache_read_tokens(resp) == 50
+
+ def test_openai_responses_cached_tokens(self):
+ resp = SimpleNamespace(
+ usage=SimpleNamespace(input_tokens_details=SimpleNamespace(cached_tokens=75)),
+ )
+ assert _extract_cache_read_tokens(resp) == 75
+
+ def test_google_cached_content_token_count(self):
+ # Google responses have both usage (no cache fields) and usage_metadata
+ resp = SimpleNamespace(
+ usage=SimpleNamespace(prompt_tokens=5, completion_tokens=10),
+ usage_metadata=SimpleNamespace(cached_content_token_count=200),
+ )
+ assert _extract_cache_read_tokens(resp) == 200
+
+ def test_no_cache_hits_returns_zero(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(prompt_tokens=5, completion_tokens=10))
+ assert _extract_cache_read_tokens(resp) == 0
+
+ def test_cache_value_zero_returns_zero(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(cache_read_input_tokens=0))
+ assert _extract_cache_read_tokens(resp) == 0
+
+
+class TestExtractCacheCreationTokens:
+ def test_no_usage_returns_zero(self):
+ assert _extract_cache_creation_tokens(SimpleNamespace()) == 0
+
+ def test_cache_creation_tokens(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(cache_creation_input_tokens=50))
+ assert _extract_cache_creation_tokens(resp) == 50
+
+ def test_cache_creation_none_returns_zero(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(cache_creation_input_tokens=None))
+ assert _extract_cache_creation_tokens(resp) == 0
+
+ def test_cache_creation_zero_returns_zero(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(cache_creation_input_tokens=0))
+ assert _extract_cache_creation_tokens(resp) == 0
+
+
+class TestExtractReasoningTokens:
+ def test_no_usage_returns_zero(self):
+ assert _extract_reasoning_tokens(SimpleNamespace()) == 0
+
+ def test_openai_completion_details(self):
+ resp = SimpleNamespace(
+ usage=SimpleNamespace(completion_tokens_details=SimpleNamespace(reasoning_tokens=42)),
+ )
+ assert _extract_reasoning_tokens(resp) == 42
+
+ def test_openai_responses_output_details(self):
+ resp = SimpleNamespace(
+ usage=SimpleNamespace(output_tokens_details=SimpleNamespace(reasoning_tokens=77)),
+ )
+ assert _extract_reasoning_tokens(resp) == 77
+
+ def test_google_thoughts_token_count(self):
+ # Google responses have both usage and usage_metadata
+ resp = SimpleNamespace(
+ usage=SimpleNamespace(completion_tokens=10),
+ usage_metadata=SimpleNamespace(thoughts_token_count=120),
+ )
+ assert _extract_reasoning_tokens(resp) == 120
+
+ def test_no_reasoning_tokens_returns_zero(self):
+ resp = SimpleNamespace(usage=SimpleNamespace(completion_tokens=10))
+ assert _extract_reasoning_tokens(resp) == 0
+
+
+class TestExtractFinishReason:
+ def test_openai_chat_choices_finish_reason(self):
+ resp = SimpleNamespace(choices=[SimpleNamespace(finish_reason="stop")])
+ assert _extract_finish_reason(resp) == "stop"
+
+ def test_openai_responses_status(self):
+ resp = SimpleNamespace(status="completed")
+ assert _extract_finish_reason(resp) == "completed"
+
+ def test_status_in_progress_is_skipped(self):
+ resp = SimpleNamespace(status="in_progress")
+ assert _extract_finish_reason(resp) == ""
+
+ def test_anthropic_stop_reason(self):
+ resp = SimpleNamespace(stop_reason="end_turn")
+ assert _extract_finish_reason(resp) == "end_turn"
+
+ def test_google_candidates_finish_reason_with_name(self):
+ resp = SimpleNamespace(
+ candidates=[SimpleNamespace(finish_reason=SimpleNamespace(name="STOP"))],
+ )
+ assert _extract_finish_reason(resp) == "STOP"
+
+ def test_google_candidates_int_finish_reason(self):
+ resp = SimpleNamespace(candidates=[SimpleNamespace(finish_reason=1)])
+ assert _extract_finish_reason(resp) == "1"
+
+ def test_no_match_returns_empty(self):
+ assert _extract_finish_reason(SimpleNamespace()) == ""
+
+ def test_empty_status_string_skipped(self):
+ resp = SimpleNamespace(status="")
+ assert _extract_finish_reason(resp) == ""
+
+
+class TestExtractResponseId:
+ def test_with_id(self):
+ resp = SimpleNamespace(id="chatcmpl-abc123")
+ assert _extract_response_id(resp) == "chatcmpl-abc123"
+
+ def test_no_id_returns_empty(self):
+ assert _extract_response_id(SimpleNamespace()) == ""
+
+
+class TestExtractResponseModel:
+ def test_with_model(self):
+ resp = SimpleNamespace(model="gpt-4o")
+ assert _extract_response_model(resp) == "gpt-4o"
+
+ def test_no_model_returns_empty(self):
+ assert _extract_response_model(SimpleNamespace()) == ""
+
+
+class TestExtractResponseText:
+ def test_text_attribute(self):
+ resp = SimpleNamespace(text=" Hello world ")
+ assert _extract_response_text(resp) == "Hello world"
+
+ def test_output_text_attribute(self):
+ resp = SimpleNamespace(output_text=" Hi there ")
+ assert _extract_response_text(resp) == "Hi there"
+
+ def test_choices_content(self):
+ resp = SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(content="From choice"))])
+ assert _extract_response_text(resp) == "From choice"
+
+ def test_no_text_found_returns_empty(self):
+ assert _extract_response_text(SimpleNamespace()) == ""
+
+ def test_text_preferred_over_choices(self):
+ resp = SimpleNamespace(text="Direct", choices=[SimpleNamespace(message=SimpleNamespace(content="Choice"))])
+ assert _extract_response_text(resp) == "Direct"
+
+ def test_output_text_preferred_over_choices(self):
+ resp = SimpleNamespace(output_text="Output", choices=[SimpleNamespace(message=SimpleNamespace(content="Choice"))])
+ assert _extract_response_text(resp) == "Output"
+
+ def test_choices_empty_content_returns_empty(self):
+ resp = SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(content=""))])
+ assert _extract_response_text(resp) == ""
+
+
# ═════════════════════════════════════════════════════════════════════════════
# AgentOpenAIMixin – _call_openai
# ═════════════════════════════════════════════════════════════════════════════
@@ -1014,6 +1230,118 @@ async def test_unsupported_output_type_raises(self):
await agent._call_google([{"role": "user", "content": "hi"}])
+# ═════════════════════════════════════════════════════════════════════════════
+# AgentExecutionMixin – _call_llm (provider routing)
+# ═════════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.asyncio
+class TestCallLLM:
+ async def test_openai_chat_style(self):
+ agent = _make_openai_agent(api_style="chat")
+ agent._call_openai = AsyncMock(return_value="chat_response")
+ result = await agent._call_llm([{"role": "user", "content": "hi"}])
+ assert result == "chat_response"
+ assert agent._effective_api_style == "chat"
+ agent._call_openai.assert_awaited_once()
+
+ async def test_openai_responses_style(self):
+ agent = _make_openai_agent(api_style="responses", model="o4-mini")
+ agent._call_openai_responses = AsyncMock(return_value="resp_response")
+ result = await agent._call_llm([{"role": "user", "content": "hi"}])
+ assert result == "resp_response"
+ assert agent._effective_api_style == "responses"
+ agent._call_openai_responses.assert_awaited_once()
+
+ async def test_openai_responses_with_output_schema_falls_back_to_chat(self):
+ agent = _make_openai_agent(api_style="responses", model="o4-mini", output_schema={"type": "object", "properties": {"answer": {"type": "string"}}})
+ agent._call_openai = AsyncMock(return_value="chat_response")
+ result = await agent._call_llm([{"role": "user", "content": "hi"}])
+ assert result == "chat_response"
+ assert agent._effective_api_style == "chat"
+ agent._call_openai.assert_awaited_once()
+
+ async def test_openai_responses_with_base_url_succeeds(self):
+ agent = _make_openai_agent(api_style="responses", model="o4-mini", base_url="http://localhost:8000/v1")
+ agent._call_openai_responses = AsyncMock(return_value="resp_response")
+ result = await agent._call_llm([{"role": "user", "content": "hi"}])
+ assert result == "resp_response"
+ assert agent._effective_api_style == "responses"
+ agent._call_openai_responses.assert_awaited_once()
+
+ async def test_openai_responses_with_base_url_fallback_on_error(self):
+ agent = _make_openai_agent(api_style="responses", model="o4-mini", base_url="http://localhost:8000/v1")
+ agent._call_openai_responses = AsyncMock(side_effect=Exception("not supported"))
+ agent._call_openai = AsyncMock(return_value="fallback_chat")
+ result = await agent._call_llm([{"role": "user", "content": "hi"}])
+ assert result == "fallback_chat"
+ assert agent._effective_api_style == "chat"
+ agent._call_openai_responses.assert_awaited_once()
+ agent._call_openai.assert_awaited_once()
+
+ async def test_openai_chat_with_reasoning_effort(self):
+ agent = _make_openai_agent(api_style="chat", reasoning_config={"effort": "high"})
+ agent._call_openai = AsyncMock(return_value="response")
+ await agent._call_llm([{"role": "user", "content": "hi"}])
+ call_kwargs = agent._call_openai.call_args[1]
+ assert call_kwargs.get("reasoning_effort") == "high"
+
+ async def test_openai_chat_with_reasoning_config_and_base_url(self):
+ agent = _make_openai_agent(api_style="chat", reasoning_config={"effort": "medium"}, base_url="http://localhost:8000/v1")
+ agent._call_openai = AsyncMock(return_value="response")
+ await agent._call_llm([{"role": "user", "content": "hi"}])
+ call_kwargs = agent._call_openai.call_args[1]
+ extra_body = call_kwargs.get("extra_body", {})
+ assert "reasoning" in extra_body
+ assert extra_body["reasoning"] == {"effort": "medium"}
+
+ async def test_google_provider(self):
+ agent = _make_google_agent()
+ agent._call_google = AsyncMock(return_value="google_response")
+ result = await agent._call_llm([{"role": "user", "content": "hi"}])
+ assert result == "google_response"
+ agent._call_google.assert_awaited_once()
+
+ async def test_unsupported_provider_raises(self):
+ agent = _make_openai_agent()
+ agent.provider = "unsupported"
+ with pytest.raises(ValueError, match="Unsupported provider"):
+ await agent._call_llm([{"role": "user", "content": "hi"}])
+
+ async def test_openai_responses_output_schema_with_reasoning_and_base_url(self):
+ """Cover lines 406-410: output_schema + reasoning_config + base_url."""
+ agent = _make_openai_agent(
+ api_style="responses",
+ model="o4-mini",
+ output_schema={"type": "object", "properties": {"a": {"type": "string"}}},
+ reasoning_config={"effort": "high"},
+ base_url="http://localhost:8000/v1",
+ )
+ agent._call_openai = AsyncMock(return_value="chat_result")
+ result = await agent._call_llm([{"role": "user", "content": "hi"}])
+ assert result == "chat_result"
+ assert agent._effective_api_style == "chat"
+ call_kwargs = agent._call_openai.call_args[1]
+ assert call_kwargs.get("reasoning_effort") == "high"
+ extra_body = call_kwargs.get("extra_body", {})
+ assert extra_body.get("reasoning") == {"effort": "high"}
+
+ async def test_openai_responses_base_url_fallback_with_reasoning(self):
+ """Cover line 429: reasoning_effort in base_url fallback path."""
+ agent = _make_openai_agent(
+ api_style="responses",
+ model="o4-mini",
+ reasoning_config={"effort": "low"},
+ base_url="http://localhost:8000/v1",
+ )
+ agent._call_openai_responses = AsyncMock(side_effect=Exception("fail"))
+ agent._call_openai = AsyncMock(return_value="fallback")
+ result = await agent._call_llm([{"role": "user", "content": "hi"}])
+ assert result == "fallback"
+ call_kwargs = agent._call_openai.call_args[1]
+ assert call_kwargs.get("reasoning_effort") == "low"
+
+
# ═════════════════════════════════════════════════════════════════════════════
# AgentExecutionMixin – _setup_tools
# ═════════════════════════════════════════════════════════════════════════════
@@ -1187,6 +1515,413 @@ def my_tool(x: str) -> str:
assert names_first.count("my_tool") == 1
assert names_second.count("my_tool") == 1
+ async def test_named_node_not_found_raises(self):
+ """container.call_factory returning None should raise RuntimeError."""
+ agent = _make_openai_agent(tool_node="MISSING")
+ agent._tool_node = None
+ agent.tool_node_name = "MISSING"
+
+ container = MagicMock()
+ container.call_factory.return_value = None
+
+ with pytest.raises(RuntimeError, match="ToolNode named 'MISSING' was not found"):
+ await agent._resolve_tools(container)
+
+ async def test_named_node_not_tool_node_raises(self):
+ """Resolved node whose func is not a ToolNode should raise."""
+ agent = _make_openai_agent(tool_node="NOT_TOOL")
+ agent._tool_node = None
+ agent.tool_node_name = "NOT_TOOL"
+
+ fake_node = MagicMock()
+ fake_node.func = "not_a_tool_node" # not a ToolNode instance
+
+ container = MagicMock()
+ container.call_factory.return_value = fake_node
+
+ with pytest.raises(RuntimeError, match="not a ToolNode"):
+ await agent._resolve_tools(container)
+
+ async def test_named_node_key_error_raises(self):
+ """Cover lines 719-720: container.call_factory raises KeyError."""
+ from injectq.utils.exceptions import DependencyNotFoundError
+
+ agent = _make_openai_agent(tool_node="MISSING")
+ agent._tool_node = None
+ agent.tool_node_name = "MISSING"
+
+ container = MagicMock()
+ container.call_factory.side_effect = KeyError("get_node")
+
+ with pytest.raises(RuntimeError, match="ToolNode named 'MISSING' was not found"):
+ await agent._resolve_tools(container)
+
+ async def test_named_node_dependency_not_found_raises(self):
+ """Cover lines 719-720: container.call_factory raises DependencyNotFoundError."""
+ from injectq.utils.exceptions import DependencyNotFoundError
+
+ agent = _make_openai_agent(tool_node="MISSING")
+ agent._tool_node = None
+ agent.tool_node_name = "MISSING"
+
+ container = MagicMock()
+ container.call_factory.side_effect = DependencyNotFoundError("get_node")
+
+ with pytest.raises(RuntimeError, match="ToolNode named 'MISSING' was not found"):
+ await agent._resolve_tools(container)
+
+
+# ═════════════════════════════════════════════════════════════════════════════
+# AgentExecutionMixin – _resolve_media_in_messages
+# ═════════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.asyncio
+class TestResolveMediaInMessages:
+ async def test_no_media_store_returns_messages_unchanged(self):
+ agent = _make_openai_agent()
+ agent.media_store = None
+ messages = [{"role": "user", "content": "hello"}]
+ result = await agent._resolve_media_in_messages(messages)
+ assert result is messages
+
+ async def test_non_list_content_skipped(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ messages = [{"role": "user", "content": "plain text"}]
+ result = await agent._resolve_media_in_messages(messages)
+ assert result is messages
+
+ async def test_non_agentflow_url_not_touched(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "https://example.com/img.png"}}]}]
+ result = await agent._resolve_media_in_messages(messages)
+ assert result[0]["content"][0]["image_url"]["url"] == "https://example.com/img.png"
+
+ async def test_openai_image_ref_resolved(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ agent.model = "gpt-4o"
+ agent.provider = "openai"
+
+ resolved = {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc123"}}
+ mock_resolver = AsyncMock()
+ mock_resolver.resolve_for_openai = AsyncMock(return_value=resolved)
+
+ messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "agentflow://media/img_1", "mime_type": "image/png"}}]}]
+
+ with patch("agentflow.storage.media.resolver.MediaRefResolver", return_value=mock_resolver):
+ result = await agent._resolve_media_in_messages(messages)
+
+ assert result[0]["content"][0] is resolved
+ mock_resolver.resolve_for_openai.assert_awaited_once()
+
+ async def test_google_image_ref_with_inline_data(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ agent.model = "gemini-2.0-flash"
+ agent.provider = "google"
+
+ resolved = SimpleNamespace(
+ inline_data=SimpleNamespace(
+ data=b"\x89PNG\r\n\x1a\n",
+ mime_type="image/png",
+ ),
+ )
+
+ mock_resolver = AsyncMock()
+ mock_resolver.resolve_for_google = AsyncMock(return_value=resolved)
+
+ messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "agentflow://media/img_g", "mime_type": "image/png"}}]}]
+
+ with patch("agentflow.storage.media.resolver.MediaRefResolver", return_value=mock_resolver):
+ result = await agent._resolve_media_in_messages(messages)
+
+ content = result[0]["content"][0]
+ assert content["type"] == "image_url"
+ assert content["image_url"]["url"].startswith("data:image/png;base64,")
+
+ async def test_google_image_ref_with_file_data(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ agent.model = "gemini-2.0-flash"
+ agent.provider = "google"
+
+ resolved = SimpleNamespace(
+ file_data=SimpleNamespace(file_uri="https://storage.googleapis.com/bucket/file"),
+ )
+
+ mock_resolver = AsyncMock()
+ mock_resolver.resolve_for_google = AsyncMock(return_value=resolved)
+
+ messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "agentflow://media/img_fd", "mime_type": "image/png"}}]}]
+
+ with patch("agentflow.storage.media.resolver.MediaRefResolver", return_value=mock_resolver):
+ result = await agent._resolve_media_in_messages(messages)
+
+ content = result[0]["content"][0]
+ assert content["type"] == "image_url"
+ assert content["image_url"]["url"] == "https://storage.googleapis.com/bucket/file"
+
+ async def test_google_image_ref_resolve_failure_logged(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ agent.model = "gemini-2.0-flash"
+ agent.provider = "google"
+
+ mock_resolver = AsyncMock()
+ mock_resolver.resolve_for_google = AsyncMock(side_effect=ValueError("API error"))
+
+ messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "agentflow://media/img_fail", "mime_type": "image/png"}}]}]
+
+ with patch("agentflow.storage.media.resolver.MediaRefResolver", return_value=mock_resolver):
+ result = await agent._resolve_media_in_messages(messages)
+
+ # Content unchanged on error
+ assert result[0]["content"][0]["image_url"]["url"] == "agentflow://media/img_fail"
+
+ async def test_model_with_provider_prefix_stripped(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ agent.model = "openai/gpt-4o"
+ agent.provider = "openai"
+
+ resolved = {"type": "image_url", "image_url": {"url": "data:...;base64,xyz"}}
+ mock_resolver = AsyncMock()
+ mock_resolver.resolve_for_openai = AsyncMock(return_value=resolved)
+
+ messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "agentflow://media/img_pfx", "mime_type": "image/png"}}]}]
+
+ with patch("agentflow.storage.media.resolver.MediaRefResolver", return_value=mock_resolver):
+ await agent._resolve_media_in_messages(messages)
+
+ # Model passed to resolver should not have the prefix
+ call_model = mock_resolver.resolve_for_openai.call_args[1].get("model")
+ assert call_model == "gpt-4o"
+ assert call_model != "openai/gpt-4o"
+
+ async def test_document_type_openai_resolved(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ agent.model = "gpt-4o"
+ agent.provider = "openai"
+
+ resolved = {"image_url": {"url": "data:application/pdf;base64,pdfdata"}}
+ mock_resolver = AsyncMock()
+ mock_resolver.resolve_for_openai = AsyncMock(return_value=resolved)
+
+ messages = [{"role": "user", "content": [{"type": "document", "document": {"url": "agentflow://media/doc_1", "mime_type": "application/pdf"}}]}]
+
+ with patch("agentflow.storage.media.resolver.MediaRefResolver", return_value=mock_resolver):
+ result = await agent._resolve_media_in_messages(messages)
+
+ content = result[0]["content"][0]
+ assert content["type"] == "document"
+ assert content["document"]["url"] == "data:application/pdf;base64,pdfdata"
+
+ async def test_video_type_google_resolved(self):
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ agent.model = "gemini-2.0-flash"
+ agent.provider = "google"
+
+ resolved = SimpleNamespace(
+ inline_data=SimpleNamespace(
+ data=b"videodata",
+ mime_type="video/mp4",
+ ),
+ )
+
+ mock_resolver = AsyncMock()
+ mock_resolver.resolve_for_google = AsyncMock(return_value=resolved)
+
+ messages = [{"role": "user", "content": [{"type": "video", "video": {"url": "agentflow://media/vid_1", "mime_type": "video/mp4"}}]}]
+
+ with patch("agentflow.storage.media.resolver.MediaRefResolver", return_value=mock_resolver):
+ result = await agent._resolve_media_in_messages(messages)
+
+ content = result[0]["content"][0]
+ assert content["type"] == "video"
+ assert content["video"]["mime_type"] == "video/mp4"
+
+ async def test_media_store_from_container_when_not_on_self(self):
+ agent = _make_openai_agent()
+ # Remove media_store from agent so it falls back to container lookup
+ if hasattr(agent, "media_store"):
+ del agent.media_store
+
+ mock_media_store = MagicMock()
+ messages = [{"role": "user", "content": "hello"}]
+
+ with patch("agentflow.core.graph.agent_internal.execution.InjectQ") as mock_injectq:
+ instance = mock_injectq.get_instance.return_value
+ instance.try_get.side_effect = lambda key: mock_media_store if key == "media_store" else None
+ result = await agent._resolve_media_in_messages(messages)
+
+ assert result is messages # no agentflow refs, so unchanged
+
+ async def test_non_dict_part_skipped(self):
+ """Cover line 595: part in content list is not a dict -> continue."""
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ messages = [{"role": "user", "content": ["not_a_dict", {"type": "image_url", "image_url": {"url": "https://example.com/img.png"}}]}]
+ result = await agent._resolve_media_in_messages(messages)
+ # Non-dict parts passed through unchanged
+ assert result[0]["content"][0] == "not_a_dict"
+
+ async def test_document_type_openai_resolve_failure(self):
+ """Cover lines 696-697: exception during document/video resolve."""
+ agent = _make_openai_agent()
+ agent.media_store = MagicMock()
+ agent.model = "gpt-4o"
+ agent.provider = "openai"
+
+ mock_resolver = AsyncMock()
+ mock_resolver.resolve_for_openai = AsyncMock(side_effect=ValueError("doc resolve error"))
+
+ messages = [{"role": "user", "content": [{"type": "document", "document": {"url": "agentflow://media/doc_fail", "mime_type": "application/pdf"}}]}]
+
+ with patch("agentflow.storage.media.resolver.MediaRefResolver", return_value=mock_resolver):
+ result = await agent._resolve_media_in_messages(messages)
+
+ assert result[0]["content"][0]["document"]["url"] == "agentflow://media/doc_fail"
+
+
+# ═════════════════════════════════════════════════════════════════════════════
+# AgentExecutionMixin – execute
+# ═════════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.asyncio
+class TestExecute:
+ async def test_basic_execute_text(self):
+ agent = _make_openai_agent()
+ state = AgentState()
+ config = {"_node_name": "AGENT", "is_stream": False}
+
+ mock_response = SimpleNamespace(
+ id="resp_1",
+ model="gpt-4o",
+ text="Hello world",
+ usage=SimpleNamespace(prompt_tokens=10, completion_tokens=20),
+ )
+
+ agent._trim_context = AsyncMock(return_value=state)
+ agent._resolve_media_in_messages = AsyncMock(side_effect=lambda msgs: msgs)
+ agent._resolve_tools = AsyncMock(return_value=[])
+ agent._call_llm_with_retry = AsyncMock(return_value=mock_response)
+ agent._build_skill_prompts = MagicMock(return_value=[])
+ agent._build_memory_prompts = AsyncMock(return_value=[])
+
+ with patch("agentflow.core.graph.agent_internal.execution.convert_messages", return_value=[{"role": "user", "content": "hi"}]), \
+ patch("agentflow.core.graph.agent_internal.execution.strip_media_blocks", side_effect=lambda msgs: msgs), \
+ patch("agentflow.runtime.publisher.publish.publish_event") as mock_publish, \
+ patch("agentflow.core.graph.agent_internal.execution.ModelResponseConverter") as mock_converter:
+
+ mock_converter_instance = MagicMock()
+ mock_converter.return_value = mock_converter_instance
+
+ result = await agent.execute(state, config)
+
+ assert result is mock_converter_instance
+ agent._call_llm_with_retry.assert_awaited_once_with(
+ messages=[{"role": "user", "content": "hi"}],
+ tools=None,
+ stream=False,
+ )
+ mock_converter.assert_called_once()
+ assert mock_converter.call_args.args[0] is mock_response
+ assert mock_converter.call_args.kwargs.get("converter") is not None
+ # START + END events published
+ assert mock_publish.call_count >= 2
+
+ async def test_execute_stream_no_end_event(self):
+ agent = _make_openai_agent()
+ state = AgentState()
+ config = {"_node_name": "AGENT", "is_stream": True}
+
+ mock_response = MagicMock()
+ agent._trim_context = AsyncMock(return_value=state)
+ agent._resolve_media_in_messages = AsyncMock(side_effect=lambda msgs: msgs)
+ agent._resolve_tools = AsyncMock(return_value=[])
+ agent._call_llm_with_retry = AsyncMock(return_value=mock_response)
+ agent._build_skill_prompts = MagicMock(return_value=[])
+ agent._build_memory_prompts = AsyncMock(return_value=[])
+
+ with patch("agentflow.core.graph.agent_internal.execution.convert_messages", return_value=[]), \
+ patch("agentflow.core.graph.agent_internal.execution.strip_media_blocks", side_effect=lambda msgs: msgs), \
+ patch("agentflow.runtime.publisher.publish.publish_event") as mock_publish, \
+ patch("agentflow.core.graph.agent_internal.execution.ModelResponseConverter") as mock_converter:
+
+ await agent.execute(state, config)
+
+ # Only START event — no END event for streams
+ assert mock_publish.call_count == 1
+
+ async def test_execute_with_multimodal_config_skips_strip(self):
+ agent = _make_openai_agent(multimodal_config=MagicMock())
+ state = AgentState()
+ config = {"_node_name": "AGENT", "is_stream": False}
+
+ mock_response = SimpleNamespace(
+ id="r1", model="gpt-4o", text="ok",
+ usage=SimpleNamespace(prompt_tokens=1, completion_tokens=1),
+ )
+
+ agent._trim_context = AsyncMock(return_value=state)
+ agent._resolve_media_in_messages = AsyncMock(side_effect=lambda msgs: msgs)
+ agent._resolve_tools = AsyncMock(return_value=[])
+ agent._call_llm_with_retry = AsyncMock(return_value=mock_response)
+
+ strip_called = []
+
+ def tracking_strip(msgs):
+ strip_called.append(True)
+ return msgs
+
+ with patch("agentflow.core.graph.agent_internal.execution.convert_messages", return_value=[]), \
+ patch("agentflow.core.graph.agent_internal.execution.strip_media_blocks", side_effect=tracking_strip), \
+ patch("agentflow.runtime.publisher.publish.publish_event"), \
+ patch("agentflow.core.graph.agent_internal.execution.ModelResponseConverter"):
+
+ await agent.execute(state, config)
+
+ # strip_media_blocks should NOT be called when multimodal_config is set
+ assert len(strip_called) == 0
+
+ async def test_execute_with_tools(self):
+ agent = _make_openai_agent()
+ state = AgentState()
+ config = {"_node_name": "AGENT", "is_stream": False}
+
+ mock_response = SimpleNamespace(
+ id="r2", model="gpt-4o", text="tool result",
+ usage=SimpleNamespace(prompt_tokens=5, completion_tokens=15),
+ )
+
+ fake_tools = [{"function": {"name": "my_tool"}}]
+
+ agent._trim_context = AsyncMock(return_value=state)
+ agent._resolve_media_in_messages = AsyncMock(side_effect=lambda msgs: msgs)
+ agent._resolve_tools = AsyncMock(return_value=fake_tools)
+ agent._call_llm_with_retry = AsyncMock(return_value=mock_response)
+ agent._build_skill_prompts = MagicMock(return_value=[])
+ agent._build_memory_prompts = AsyncMock(return_value=[])
+
+ with patch("agentflow.core.graph.agent_internal.execution.convert_messages", return_value=[]), \
+ patch("agentflow.core.graph.agent_internal.execution.strip_media_blocks", side_effect=lambda msgs: msgs), \
+ patch("agentflow.runtime.publisher.publish.publish_event"), \
+ patch("agentflow.core.graph.agent_internal.execution.ModelResponseConverter"):
+
+ await agent.execute(state, config)
+
+ agent._call_llm_with_retry.assert_awaited_once_with(
+ messages=[],
+ tools=fake_tools,
+ stream=False,
+ )
+
# ═════════════════════════════════════════════════════════════════════════════
# Agent.__init__ – construction edge cases
diff --git a/tests/graph/test_agent_retry_fallback.py b/tests/graph/test_agent_retry_fallback.py
index 3040e02..f17ce1b 100644
--- a/tests/graph/test_agent_retry_fallback.py
+++ b/tests/graph/test_agent_retry_fallback.py
@@ -229,6 +229,23 @@ def test_string_fallback_429(self):
exc = Exception("Rate limited: 429 Too Many Requests")
assert agent._extract_status_code(exc) == 429
+ def test_non_integer_code_returns_none(self):
+ agent = _make_agent()
+ exc = Exception("some error")
+ exc.code = "UNKNOWN"
+ assert agent._extract_status_code(exc) is None
+
+ def test_none_code_returns_none(self):
+ agent = _make_agent()
+ exc = Exception("some error")
+ exc.code = None
+ assert agent._extract_status_code(exc) is None
+
+ def test_string_with_no_code_returns_none(self):
+ agent = _make_agent()
+ exc = Exception("Something went wrong without a status code")
+ assert agent._extract_status_code(exc) is None
+
# ═════════════════════════════════════════════════════════════════════════════
# _is_retryable_error
diff --git a/tests/graph/test_token_tracking.py b/tests/graph/test_token_tracking.py
new file mode 100644
index 0000000..f8eef3e
--- /dev/null
+++ b/tests/graph/test_token_tracking.py
@@ -0,0 +1,346 @@
+"""Tests for token tracking functionality in graph execution handlers.
+
+This module tests:
+- Token usage calculation from agent state
+- Token tracking in StreamHandler
+- Token tracking in InvokeHandler
+"""
+
+import pytest
+
+from agentflow.core.graph import StateGraph
+from agentflow.core.graph.utils.utils import calculate_token_usage
+from agentflow.core.state import AgentState, Message, TokenUsages
+from agentflow.core.state.message_block import TextBlock
+from agentflow.utils import END, ResponseGranularity
+
+
+class TestCalculateTokenUsage:
+ """Test the calculate_token_usage utility function."""
+
+ def test_empty_context(self):
+ """Test token calculation with no messages."""
+ state = AgentState()
+ usage = calculate_token_usage(state.context if state.context else [])
+
+ assert usage["total_input_tokens"] == 0 # noqa: S101
+ assert usage["total_output_tokens"] == 0 # noqa: S101
+ assert usage["total_reasoning_tokens"] == 0 # noqa: S101
+ assert usage["total_tokens"] == 0 # noqa: S101
+
+ def test_single_message_with_usage(self):
+ """Test token calculation with a single message."""
+ message = Message(
+ role="assistant",
+ content=[TextBlock(text="Hello")],
+ usages=TokenUsages(
+ prompt_tokens=10,
+ completion_tokens=20,
+ total_tokens=30,
+ reasoning_tokens=5,
+ ),
+ )
+
+ state = AgentState(context=[message])
+ usage = calculate_token_usage(state.context)
+
+ assert usage["total_input_tokens"] == 10 # noqa: S101
+ assert usage["total_output_tokens"] == 20 # noqa: S101
+ assert usage["total_reasoning_tokens"] == 5 # noqa: S101
+ assert usage["total_tokens"] == 30 # noqa: S101
+
+ def test_multiple_messages_with_usage(self):
+ """Test token calculation with multiple messages."""
+ messages = [
+ Message(
+ role="user",
+ content=[TextBlock(text="Hello")],
+ usages=TokenUsages(
+ prompt_tokens=5,
+ completion_tokens=0,
+ total_tokens=5,
+ reasoning_tokens=0,
+ ),
+ ),
+ Message(
+ role="assistant",
+ content=[TextBlock(text="Hi there!")],
+ usages=TokenUsages(
+ prompt_tokens=10,
+ completion_tokens=15,
+ total_tokens=25,
+ reasoning_tokens=3,
+ ),
+ ),
+ Message(
+ role="user",
+ content=[TextBlock(text="How are you?")],
+ usages=TokenUsages(
+ prompt_tokens=8,
+ completion_tokens=0,
+ total_tokens=8,
+ reasoning_tokens=0,
+ ),
+ ),
+ Message(
+ role="assistant",
+ content=[TextBlock(text="I'm doing well, thanks!")],
+ usages=TokenUsages(
+ prompt_tokens=12,
+ completion_tokens=20,
+ total_tokens=32,
+ reasoning_tokens=5,
+ ),
+ ),
+ ]
+
+ state = AgentState(context=messages)
+ usage = calculate_token_usage(state.context)
+
+ # Total: 5 + 10 + 8 + 12 = 35 input tokens
+ assert usage["total_input_tokens"] == 35 # noqa: S101
+ # Total: 0 + 15 + 0 + 20 = 35 output tokens
+ assert usage["total_output_tokens"] == 35 # noqa: S101
+ # Total: 0 + 3 + 0 + 5 = 8 reasoning tokens
+ assert usage["total_reasoning_tokens"] == 8 # noqa: S101
+ # Total: 35 + 35 = 70 tokens
+ assert usage["total_tokens"] == 70 # noqa: S101
+
+ def test_messages_without_usage(self):
+ """Test token calculation with messages that don't have usage data."""
+ messages = [
+ Message(
+ role="user",
+ content=[TextBlock(text="Hello")],
+ usages=None,
+ ),
+ Message(
+ role="assistant",
+ content=[TextBlock(text="Hi there!")],
+ usages=TokenUsages(
+ prompt_tokens=10,
+ completion_tokens=15,
+ total_tokens=25,
+ reasoning_tokens=3,
+ ),
+ ),
+ ]
+
+ state = AgentState(context=messages)
+ usage = calculate_token_usage(state.context)
+
+ # Only the second message has usage
+ assert usage["total_input_tokens"] == 10 # noqa: S101
+ assert usage["total_output_tokens"] == 15 # noqa: S101
+ assert usage["total_reasoning_tokens"] == 3 # noqa: S101
+ assert usage["total_tokens"] == 25 # noqa: S101
+
+ def test_mixed_messages_with_and_without_usage(self):
+ """Test token calculation with mix of messages with and without usage."""
+ messages = [
+ Message(role="user", content=[TextBlock(text="Hello")], usages=None),
+ Message(
+ role="assistant",
+ content=[TextBlock(text="Hi")],
+ usages=TokenUsages(
+ prompt_tokens=10, completion_tokens=5, total_tokens=15, reasoning_tokens=2
+ ),
+ ),
+ Message(role="user", content=[TextBlock(text="Thanks")], usages=None),
+ Message(
+ role="assistant",
+ content=[TextBlock(text="You're welcome")],
+ usages=TokenUsages(
+ prompt_tokens=8, completion_tokens=12, total_tokens=20, reasoning_tokens=1
+ ),
+ ),
+ ]
+
+ state = AgentState(context=messages)
+ usage = calculate_token_usage(state.context)
+
+ assert usage["total_input_tokens"] == 18 # noqa: S101
+ assert usage["total_output_tokens"] == 17 # noqa: S101
+ assert usage["total_reasoning_tokens"] == 3 # noqa: S101
+ assert usage["total_tokens"] == 35 # noqa: S101
+
+
+@pytest.mark.asyncio
+class TestStreamHandlerTokenTracking:
+ """Test token tracking in StreamHandler during graph execution."""
+
+ async def test_stream_handler_includes_token_usage_in_final_chunk(self):
+ """Test that StreamHandler includes token usage in final stream chunk."""
+ from agentflow.core.state.stream_chunks import StreamEvent
+
+ # Define a simple node that returns a message with token usage
+ def agent_node(state: AgentState, config: dict) -> list[Message]:
+ return [
+ Message(
+ role="assistant",
+ content=[TextBlock(text="Hello from agent")],
+ usages=TokenUsages(
+ prompt_tokens=100,
+ completion_tokens=50,
+ total_tokens=150,
+ reasoning_tokens=10,
+ ),
+ )
+ ]
+
+ # Build and compile graph
+ graph = StateGraph(AgentState)
+ graph.add_node("agent", agent_node)
+ graph.set_entry_point("agent")
+ graph.add_edge("agent", END)
+ compiled_graph = graph.compile()
+
+ # Execute with streaming
+ chunks = []
+ async for chunk in compiled_graph.astream(
+ {"messages": [Message(role="user", content=[TextBlock(text="Hi")])]},
+ response_granularity=ResponseGranularity.FULL,
+ ):
+ chunks.append(chunk)
+
+ # Find the final UPDATES chunk
+ final_chunks = [c for c in chunks if c.event == StreamEvent.UPDATES and c.data.get("status") == "graph_invoked"]
+ assert len(final_chunks) > 0 # noqa: S101
+
+ final_chunk = final_chunks[-1]
+
+ # Verify token usage is included
+ assert "total_input_tokens" in final_chunk.data # noqa: S101
+ assert "total_output_tokens" in final_chunk.data # noqa: S101
+ assert "total_reasoning_tokens" in final_chunk.data # noqa: S101
+ assert "total_tokens" in final_chunk.data # noqa: S101
+
+ # Verify token counts (should match what we set in the message)
+ assert final_chunk.data["total_input_tokens"] == 100 # noqa: S101
+ assert final_chunk.data["total_output_tokens"] == 50 # noqa: S101
+ assert final_chunk.data["total_reasoning_tokens"] == 10 # noqa: S101
+ assert final_chunk.data["total_tokens"] == 150 # noqa: S101
+
+
+@pytest.mark.asyncio
+class TestInvokeHandlerTokenTracking:
+ """Test token tracking in InvokeHandler during graph execution."""
+
+ async def test_invoke_handler_includes_token_usage_in_response(self):
+ """Test that InvokeHandler includes token usage in response metadata."""
+
+ # Define a simple node that returns a message with token usage
+ def agent_node(state: AgentState, config: dict) -> list[Message]:
+ return [
+ Message(
+ role="assistant",
+ content=[TextBlock(text="Response from agent")],
+ usages=TokenUsages(
+ prompt_tokens=200,
+ completion_tokens=100,
+ total_tokens=300,
+ reasoning_tokens=20,
+ ),
+ )
+ ]
+
+ # Build and compile graph
+ graph = StateGraph(AgentState)
+ graph.add_node("agent", agent_node)
+ graph.set_entry_point("agent")
+ graph.add_edge("agent", END)
+ compiled_graph = graph.compile()
+
+ # Execute with invoke
+ result = await compiled_graph.ainvoke(
+ {"messages": [Message(role="user", content=[TextBlock(text="Hi")])]},
+ response_granularity=ResponseGranularity.FULL,
+ )
+
+ # The result should contain the final state
+ assert "state" in result # noqa: S101
+ assert "token_usage" in result # noqa: S101
+
+ # Verify token counts match what we set
+ token_usage = result["token_usage"]
+ assert token_usage["total_input_tokens"] == 200 # noqa: S101
+ assert token_usage["total_output_tokens"] == 100 # noqa: S101
+ assert token_usage["total_reasoning_tokens"] == 20 # noqa: S101
+ assert token_usage["total_tokens"] == 300 # noqa: S101 # input + output only
+
+
+@pytest.mark.asyncio
+class TestMultipleNodesTokenTracking:
+ """Test token tracking across multiple nodes in a graph."""
+
+ async def test_token_accumulation_across_nodes(self):
+ """Test that tokens accumulate correctly across multiple nodes."""
+
+ def node1(state: AgentState, config: dict) -> list[Message]:
+ return [
+ Message(
+ role="assistant",
+ content=[TextBlock(text="Node 1 response")],
+ usages=TokenUsages(
+ prompt_tokens=50,
+ completion_tokens=30,
+ total_tokens=80,
+ reasoning_tokens=5,
+ ),
+ )
+ ]
+
+ def node2(state: AgentState, config: dict) -> list[Message]:
+ return [
+ Message(
+ role="assistant",
+ content=[TextBlock(text="Node 2 response")],
+ usages=TokenUsages(
+ prompt_tokens=60,
+ completion_tokens=40,
+ total_tokens=100,
+ reasoning_tokens=8,
+ ),
+ )
+ ]
+
+ def node3(state: AgentState, config: dict) -> list[Message]:
+ return [
+ Message(
+ role="assistant",
+ content=[TextBlock(text="Node 3 response")],
+ usages=TokenUsages(
+ prompt_tokens=70,
+ completion_tokens=50,
+ total_tokens=120,
+ reasoning_tokens=10,
+ ),
+ )
+ ]
+
+ # Build graph with multiple nodes
+ graph = StateGraph(AgentState)
+ graph.add_node("node1", node1)
+ graph.add_node("node2", node2)
+ graph.add_node("node3", node3)
+ graph.set_entry_point("node1")
+ graph.add_edge("node1", "node2")
+ graph.add_edge("node2", "node3")
+ graph.add_edge("node3", END)
+ compiled_graph = graph.compile()
+
+ # Execute
+ result = await compiled_graph.ainvoke(
+ {"messages": [Message(role="user", content=[TextBlock(text="Start")])]},
+ response_granularity=ResponseGranularity.FULL,
+ )
+
+ # Verify token usage is included in the result
+ assert "token_usage" in result # noqa: S101
+ token_usage = result["token_usage"]
+
+ # Verify accumulated tokens (50 + 60 + 70 = 180 input, 30 + 40 + 50 = 120 output, 5 + 8 + 10 = 23 reasoning)
+ assert token_usage["total_input_tokens"] == 180 # noqa: S101
+ assert token_usage["total_output_tokens"] == 120 # noqa: S101
+ assert token_usage["total_reasoning_tokens"] == 23 # noqa: S101
+ assert token_usage["total_tokens"] == 300 # noqa: S101 # input + output only
diff --git a/tests/publisher/test_composite_publisher.py b/tests/publisher/test_composite_publisher.py
new file mode 100644
index 0000000..3a24107
--- /dev/null
+++ b/tests/publisher/test_composite_publisher.py
@@ -0,0 +1,43 @@
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+from agentflow.runtime.publisher.composite_publisher import CompositePublisher
+from agentflow.runtime.publisher.base_publisher import BasePublisher
+from agentflow.runtime.publisher.events import EventModel
+
+
+@pytest.mark.asyncio
+async def test_composite_publisher_lifecycle():
+ pub1 = AsyncMock(spec=BasePublisher)
+ pub2 = AsyncMock(spec=BasePublisher)
+
+ # Initialize
+ composite = CompositePublisher([pub1])
+ assert pub1 in composite._publishers
+ assert pub2 not in composite._publishers
+
+ # Add publisher
+ composite.add_publisher(pub2)
+ assert pub2 in composite._publishers
+
+ # Publish
+ event = MagicMock(spec=EventModel)
+ await composite.publish(event)
+ pub1.publish.assert_called_once_with(event)
+ pub2.publish.assert_called_once_with(event)
+
+ # Close
+ await composite.close()
+ pub1.close.assert_called_once()
+ pub2.close.assert_called_once()
+
+ # Sync close
+ pub1.sync_close = MagicMock()
+ pub2.sync_close = MagicMock()
+ composite.sync_close()
+ pub1.sync_close.assert_called_once()
+ pub2.sync_close.assert_called_once()
+
+ # Remove publisher
+ composite.remove_publisher(pub1)
+ assert pub1 not in composite._publishers
+ assert pub2 in composite._publishers
diff --git a/tests/publisher/test_optional_publishers.py b/tests/publisher/test_optional_publishers.py
index 05e9592..3f20b14 100644
--- a/tests/publisher/test_optional_publishers.py
+++ b/tests/publisher/test_optional_publishers.py
@@ -411,6 +411,84 @@ async def test_kafka_publisher_publish_success(self, mock_import):
assert parsed_message["event"] == "tool_execution"
assert parsed_message["node_name"] == "kafka_test"
+ @pytest.mark.asyncio
+ async def test_kafka_publisher_closed_errors(self):
+ """Test KafkaPublisher closed errors."""
+ from agentflow.runtime.publisher.kafka_publisher import KafkaPublisher
+ publisher = KafkaPublisher()
+ publisher._is_closed = True
+
+ with pytest.raises(RuntimeError, match="KafkaPublisher is closed"):
+ await publisher._get_producer()
+
+ with pytest.raises(RuntimeError, match="Cannot publish to closed KafkaPublisher"):
+ await publisher.publish(EventModel(event=Event.GRAPH_EXECUTION, event_type=EventType.START))
+
+ @pytest.mark.asyncio
+ @patch('importlib.import_module')
+ async def test_kafka_publisher_get_producer_early_return(self, mock_import):
+ """Test early return in _get_producer if already initialized."""
+ from agentflow.runtime.publisher.kafka_publisher import KafkaPublisher
+ publisher = KafkaPublisher()
+ publisher._producer = MagicMock()
+
+ res = await publisher._get_producer()
+ assert res == publisher._producer
+ mock_import.assert_not_called()
+
+ @pytest.mark.asyncio
+ async def test_kafka_publisher_missing_module(self):
+ """Test missing aiokafka ImportError fallback."""
+ from agentflow.runtime.publisher.kafka_publisher import KafkaPublisher
+ publisher = KafkaPublisher()
+
+ with patch('agentflow.runtime.publisher.kafka_publisher.importlib.import_module', side_effect=ImportError):
+ with pytest.raises(RuntimeError, match="requires the 'aiokafka' package"):
+ await publisher._get_producer()
+
+ @pytest.mark.asyncio
+ async def test_kafka_publisher_close_methods(self):
+ """Test close() and sync_close() paths."""
+ from agentflow.runtime.publisher.kafka_publisher import KafkaPublisher
+ publisher = KafkaPublisher()
+
+ # Closed early return
+ publisher._is_closed = True
+ await publisher.close()
+
+ # Close with producer
+ publisher._is_closed = False
+ mock_producer = AsyncMock()
+ publisher._producer = mock_producer
+
+ await publisher.close()
+ mock_producer.stop.assert_called_once()
+ assert publisher._producer is None
+ assert publisher._is_closed is True
+
+ # Close with exception in stop
+ publisher = KafkaPublisher()
+ mock_producer = AsyncMock()
+ mock_producer.stop.side_effect = Exception("stop failed")
+ publisher._producer = mock_producer
+
+ await publisher.close()
+ assert publisher._producer is None
+ assert publisher._is_closed is True
+
+ # Test sync_close
+ publisher = KafkaPublisher()
+ publisher.close = AsyncMock()
+ publisher.sync_close()
+ publisher.close.assert_called_once()
+
+ # Test sync_close active loop warning
+ publisher = KafkaPublisher()
+ with patch('asyncio.run', side_effect=RuntimeError):
+ with patch('agentflow.runtime.publisher.kafka_publisher.logger.warning') as mock_warn:
+ publisher.sync_close()
+ mock_warn.assert_called_once()
+
class TestRabbitMQPublisher:
"""Test RabbitMQPublisher with mocked dependencies."""
@@ -504,6 +582,124 @@ async def test_rabbitmq_publisher_publish_success(self, mock_import):
# Verify message creation - Mock should be called as a constructor
assert mock_pika.Message.called
+
+ @pytest.mark.asyncio
+ async def test_rabbitmq_publisher_closed_errors(self):
+ """Test RabbitMQPublisher raising RuntimeError when closed."""
+ from agentflow.runtime.publisher.rabbitmq_publisher import RabbitMQPublisher
+ publisher = RabbitMQPublisher()
+ publisher._is_closed = True
+
+ with pytest.raises(RuntimeError, match="RabbitMQPublisher is closed"):
+ await publisher._ensure()
+
+ with pytest.raises(RuntimeError, match="Cannot publish to closed RabbitMQPublisher"):
+ await publisher.publish(EventModel(event=Event.GRAPH_EXECUTION, event_type=EventType.START))
+
+ @pytest.mark.asyncio
+ @patch('importlib.import_module')
+ async def test_rabbitmq_publisher_ensure_early_return(self, mock_import):
+ """Test early return in _ensure when exchange is already declared."""
+ from agentflow.runtime.publisher.rabbitmq_publisher import RabbitMQPublisher
+ publisher = RabbitMQPublisher()
+ publisher._exchange = MagicMock()
+
+ await publisher._ensure()
+ mock_import.assert_not_called()
+
+ @pytest.mark.asyncio
+ async def test_rabbitmq_publisher_missing_module(self):
+ """Test RuntimeError when aio_pika module is missing."""
+ from agentflow.runtime.publisher.rabbitmq_publisher import RabbitMQPublisher
+ publisher = RabbitMQPublisher()
+
+ with patch('agentflow.runtime.publisher.rabbitmq_publisher.importlib.import_module', side_effect=ImportError):
+ with pytest.raises(RuntimeError, match="requires the 'aio-pika' package"):
+ await publisher._ensure()
+
+ @pytest.mark.asyncio
+ @patch('importlib.import_module')
+ async def test_rabbitmq_publisher_default_exchange_fallback(self, mock_import):
+ """Test default exchange declaration fallback when declare=False."""
+ mock_pika = Mock()
+ mock_connection = AsyncMock()
+ mock_channel = AsyncMock()
+
+ mock_pika.connect_robust = AsyncMock(return_value=mock_connection)
+ mock_connection.channel = AsyncMock(return_value=mock_channel)
+ mock_import.return_value = mock_pika
+
+ with patch.dict('sys.modules', {'aio_pika': mock_pika}):
+ from agentflow.runtime.publisher.rabbitmq_publisher import RabbitMQPublisher
+ publisher = RabbitMQPublisher({"declare": False})
+
+ await publisher._ensure()
+ assert publisher._exchange == mock_channel.default_exchange
+
+ @pytest.mark.asyncio
+ @patch('importlib.import_module')
+ async def test_rabbitmq_publisher_publish_not_initialized(self, mock_import):
+ """Test publish raises RuntimeError when exchange is not initialized."""
+ mock_pika = Mock()
+ mock_import.return_value = mock_pika
+ from agentflow.runtime.publisher.rabbitmq_publisher import RabbitMQPublisher
+ publisher = RabbitMQPublisher()
+
+ # Mock _ensure to do nothing so _exchange remains None
+ with patch.object(publisher, '_ensure', AsyncMock()):
+ with pytest.raises(RuntimeError, match="exchange not initialized"):
+ await publisher.publish(EventModel(event=Event.GRAPH_EXECUTION, event_type=EventType.START))
+
+ @pytest.mark.asyncio
+ async def test_rabbitmq_publisher_close_methods(self):
+ """Test close() and sync_close() paths."""
+ from agentflow.runtime.publisher.rabbitmq_publisher import RabbitMQPublisher
+ publisher = RabbitMQPublisher()
+
+ # Close when already closed
+ publisher._is_closed = True
+ await publisher.close() # Should return early
+
+ # Close with active channel and connection
+ publisher._is_closed = False
+ mock_channel = AsyncMock()
+ mock_conn = AsyncMock()
+ publisher._channel = mock_channel
+ publisher._conn = mock_conn
+
+ await publisher.close()
+ mock_channel.close.assert_called_once()
+ mock_conn.close.assert_called_once()
+ assert publisher._channel is None
+ assert publisher._conn is None
+ assert publisher._is_closed is True
+
+ # Test close with exceptions on close call (should catch and log)
+ publisher = RabbitMQPublisher()
+ publisher._channel = AsyncMock()
+ publisher._channel.close.side_effect = Exception("channel close failed")
+ publisher._conn = AsyncMock()
+ publisher._conn.close.side_effect = Exception("conn close failed")
+
+ await publisher.close()
+ assert publisher._channel is None
+ assert publisher._conn is None
+ assert publisher._is_closed is True
+
+ # Test sync_close
+ publisher = RabbitMQPublisher()
+ publisher.close = AsyncMock()
+ publisher.sync_close()
+ publisher.close.assert_called_once()
+
+ # Test sync_close raises RuntimeError (active loop)
+ publisher = RabbitMQPublisher()
+ with patch('asyncio.run', side_effect=RuntimeError("active loop")):
+ with patch('agentflow.runtime.publisher.rabbitmq_publisher.logger.warning') as mock_warn:
+ publisher.sync_close()
+ mock_warn.assert_called_once_with("sync_close called within an active event loop; skipping.")
+
+
class TestOptionalPublisherErrorHandling:
"""Test error handling across optional publishers."""
diff --git a/tests/publisher/test_publish.py b/tests/publisher/test_publish.py
new file mode 100644
index 0000000..29d4f12
--- /dev/null
+++ b/tests/publisher/test_publish.py
@@ -0,0 +1,42 @@
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+from agentflow.runtime.publisher.publish import _publish_event_task, publish_event
+from agentflow.runtime.publisher.events import EventModel
+from agentflow.runtime.publisher.base_publisher import BasePublisher
+from agentflow.utils.background_task_manager import BackgroundTaskManager
+
+
+@pytest.mark.asyncio
+async def test_publish_event_task_success():
+ event = MagicMock(spec=EventModel)
+ publisher = AsyncMock(spec=BasePublisher)
+
+ await _publish_event_task(event, publisher)
+
+ publisher.publish.assert_called_once_with(event)
+
+
+@pytest.mark.asyncio
+async def test_publish_event_task_failure():
+ event = MagicMock(spec=EventModel)
+ publisher = AsyncMock(spec=BasePublisher)
+ publisher.publish.side_effect = RuntimeError("publish boom")
+
+ await _publish_event_task(event, publisher)
+
+ publisher.publish.assert_called_once_with(event)
+
+
+@pytest.mark.asyncio
+async def test_publish_event_task_no_publisher():
+ event = MagicMock(spec=EventModel)
+ await _publish_event_task(event, None)
+
+
+def test_publish_event():
+ event = MagicMock(spec=EventModel)
+ publisher = MagicMock(spec=BasePublisher)
+ task_manager = MagicMock(spec=BackgroundTaskManager)
+
+ publish_event(event, publisher, task_manager)
+ task_manager.create_task.assert_called_once()
diff --git a/tests/storage/media/test_base_media_store.py b/tests/storage/media/test_base_media_store.py
new file mode 100644
index 0000000..fb7ba8a
--- /dev/null
+++ b/tests/storage/media/test_base_media_store.py
@@ -0,0 +1,58 @@
+import pytest
+from typing import Any
+from agentflow.storage.media.storage.base import BaseMediaStore
+from agentflow.core.state.message_block import MediaRef
+
+class ConcreteMediaStore(BaseMediaStore):
+ """A concrete implementation of BaseMediaStore for testing."""
+ def __init__(self):
+ self.store_dict = {}
+
+ async def store(self, data: bytes, mime_type: str, metadata: dict[str, Any] | None = None) -> str:
+ key = f"key_{len(self.store_dict)}"
+ self.store_dict[key] = (data, mime_type)
+ return key
+
+ async def retrieve(self, storage_key: str) -> tuple[bytes, str]:
+ if storage_key not in self.store_dict:
+ raise KeyError(f"Key {storage_key} not found")
+ return self.store_dict[storage_key]
+
+ async def delete(self, storage_key: str) -> bool:
+ if storage_key in self.store_dict:
+ del self.store_dict[storage_key]
+ return True
+ return False
+
+ async def exists(self, storage_key: str) -> bool:
+ return storage_key in self.store_dict
+
+@pytest.mark.asyncio
+async def test_get_metadata_success():
+ store = ConcreteMediaStore()
+ key = await store.store(b"hello", "text/plain")
+ metadata = await store.get_metadata(key)
+ assert metadata == {
+ "mime_type": "text/plain",
+ "size_bytes": 5
+ }
+
+@pytest.mark.asyncio
+async def test_get_metadata_key_error():
+ store = ConcreteMediaStore()
+ metadata = await store.get_metadata("nonexistent")
+ assert metadata is None
+
+@pytest.mark.asyncio
+async def test_get_direct_url_default():
+ store = ConcreteMediaStore()
+ url = await store.get_direct_url("key")
+ assert url is None
+
+def test_to_media_ref():
+ store = ConcreteMediaStore()
+ ref = store.to_media_ref("key", "image/png")
+ assert isinstance(ref, MediaRef)
+ assert ref.kind == "url"
+ assert ref.url == "agentflow://media/key"
+ assert ref.mime_type == "image/png"
diff --git a/tests/storage/media/test_media_resolver_extra.py b/tests/storage/media/test_media_resolver_extra.py
index 14bff5f..85b2e5f 100644
--- a/tests/storage/media/test_media_resolver_extra.py
+++ b/tests/storage/media/test_media_resolver_extra.py
@@ -284,3 +284,132 @@ async def test_retrieve_bytes_invalid_kind_raises():
def test_openai_image_url_helper_shape():
part = _openai_image_url("https://example.com/x.png")
assert part == {"type": "image_url", "image_url": {"url": "https://example.com/x.png"}}
+
+
+def test_with_cache_configures_resolver():
+ from agentflow.storage.media.resolver import MediaRefResolver
+ resolver = MediaRefResolver()
+ cache = object()
+ out = resolver.with_cache(cache, expiration_seconds=1800, refresh_buffer_seconds=30)
+ assert out is resolver
+ assert resolver.cache_backend is cache
+ assert resolver.direct_url_expiration_seconds == 1800
+ assert resolver.direct_url_refresh_buffer_seconds == 30
+
+
+@pytest.mark.asyncio
+async def test_resolve_openai_legacy_fallback_reftypes_and_empty():
+ from agentflow.storage.media.resolver import MediaRefResolver
+ resolver = MediaRefResolver()
+ ref_empty = MediaRef.model_construct(kind="unknown")
+ res = await resolver._resolve_openai_legacy(ref_empty)
+ assert res == {"type": "image_url", "image_url": {"url": ""}}
+
+
+@pytest.mark.asyncio
+async def test_resolve_google_legacy_various_refs():
+ from agentflow.storage.media.resolver import MediaRefResolver
+ resolver = MediaRefResolver(media_store=_Store())
+
+ class _Part:
+ @staticmethod
+ def from_uri(file_uri, mime_type):
+ return {"uri": file_uri, "mime": mime_type}
+
+ @staticmethod
+ def from_bytes(data, mime_type):
+ return {"data": data, "mime": mime_type}
+
+ def __init__(self, **kwargs):
+ self.kwargs = kwargs
+
+ with patch("google.genai.types.Part", _Part):
+ ref_internal = MediaRef(kind="url", url="agentflow://media/k")
+ part1 = await resolver._resolve_google_legacy(ref_internal)
+ assert part1 == {"uri": "https://signed.example/k.png", "mime": "application/octet-stream"}
+
+ resolver.media_store.url_map["k"] = None
+ part2 = await resolver._resolve_google_legacy(ref_internal)
+ assert part2 == {"data": b"abc", "mime": "image/png"}
+
+ ref_external = MediaRef(kind="url", url="https://example.com/y.jpg", mime_type="image/jpeg")
+ part3 = await resolver._resolve_google_legacy(ref_external)
+ assert part3 == {"uri": "https://example.com/y.jpg", "mime": "image/jpeg"}
+
+ ref_data = MediaRef(kind="data", data_base64=base64.b64encode(b"hello").decode(), mime_type="text/plain")
+ part4 = await resolver._resolve_google_legacy(ref_data)
+ assert part4 == {"data": b"hello", "mime": "text/plain"}
+
+ ref_file = MediaRef(kind="file_id", file_id="file-123", mime_type="image/png")
+ part5 = await resolver._resolve_google_legacy(ref_file)
+ assert isinstance(part5, _Part)
+ assert part5.kwargs["file_data"].file_uri == "file-123"
+
+ ref_unres = MediaRef.model_construct(kind="unknown")
+ part6 = await resolver._resolve_google_legacy(ref_unres)
+ assert isinstance(part6, _Part)
+ assert part6.kwargs["text"] == "[Unresolvable media reference]"
+
+
+@pytest.mark.asyncio
+async def test_try_transport_modes():
+ from agentflow.storage.media.resolver import MediaRefResolver
+ resolver = MediaRefResolver()
+
+ result = await resolver._try_transport(MediaRef(kind="url"), MediaTransportMode.provider_file, "openai", object())
+ assert result is None
+
+ class _Part:
+ @staticmethod
+ def from_uri(file_uri, mime_type):
+ return {"uri": file_uri, "mime": mime_type}
+
+ resolver.media_store = _Store()
+ caps = type("Caps", (), {"can_convert_internal_to_remote": True})()
+ with patch("google.genai.types.Part", _Part):
+ res = await resolver._transport_remote_url(
+ MediaRef(kind="url", url="agentflow://media/k", mime_type="image/png"),
+ caps,
+ provider="google"
+ )
+ assert res == {"uri": "https://signed.example/k.png", "mime": "image/png"}
+
+
+@pytest.mark.asyncio
+async def test_transport_inline_bytes_url_retrieve_fail():
+ from agentflow.storage.media.resolver import MediaRefResolver
+ resolver = MediaRefResolver()
+ async def _fail(*args):
+ raise ValueError("fetch fail")
+ resolver._retrieve_bytes = _fail
+ res = await resolver._transport_inline_bytes(MediaRef(kind="url", url="https://x"), "openai")
+ assert res is None
+
+
+@pytest.mark.asyncio
+async def test_get_cached_signed_url_expired_or_invalid():
+ from agentflow.storage.media.resolver import MediaRefResolver
+ resolver = MediaRefResolver(cache_backend=_Cache())
+ resolver.cache_backend.values[("media:signed-url", "k")] = "not-a-dict"
+ res1 = await resolver._get_cached_signed_url("k")
+ assert res1 is None
+
+ resolver.cache_backend.values[("media:signed-url", "k")] = {"url": "https://x"}
+ res2 = await resolver._get_cached_signed_url("k")
+ assert res2 is None
+
+ resolver.cache_backend.values[("media:signed-url", "k")] = {"url": "https://x", "expires_at": 100}
+ resolver.direct_url_refresh_buffer_seconds = 60
+ res3 = await resolver._get_cached_signed_url("k")
+ assert res3 is None
+
+
+def test_source_kind_helper_variations():
+ from agentflow.storage.media.resolver import _source_kind
+ assert _source_kind(MediaRef(kind="url", url="agentflow://media/k")) == "internal_ref"
+ assert _source_kind(MediaRef(kind="url", url="https://x")) == "url"
+ assert _source_kind(MediaRef(kind="data")) == "data"
+ assert _source_kind(MediaRef(kind="file_id")) == "file_id"
+ assert _source_kind(MediaRef.model_construct(kind="other")) == "other"
+
+
diff --git a/tests/storage/media/test_provider_media.py b/tests/storage/media/test_provider_media.py
new file mode 100644
index 0000000..7a03614
--- /dev/null
+++ b/tests/storage/media/test_provider_media.py
@@ -0,0 +1,108 @@
+import pytest
+from unittest.mock import MagicMock, AsyncMock, patch
+from agentflow.storage.media.provider_media import (
+ ProviderMediaCache,
+ should_use_google_file_api,
+ prepare_google_content_part,
+ upload_to_google_file_api,
+ create_openai_file_search_tool,
+ create_openai_file_attachment
+)
+
+def test_provider_media_cache():
+ cache = ProviderMediaCache(max_entries=2)
+ key1 = cache.content_key(b"data1")
+ key2 = cache.content_key(b"data2")
+ key3 = cache.content_key(b"data3")
+
+ cache.put("google", key1, "ref1")
+ cache.put("google", key2, "ref2")
+ assert cache.get("google", key1) == "ref1"
+ assert cache.get("google", key2) == "ref2"
+
+ # Test eviction: key1 (oldest) should be evicted when key3 is added
+ cache.put("google", key3, "ref3")
+ assert cache.get("google", key1) is None
+ assert cache.get("google", key2) == "ref2"
+ assert cache.get("google", key3) == "ref3"
+
+ # Test clear for specific provider
+ cache.clear("google")
+ assert cache.get("google", key2) is None
+
+ # Test clear for all
+ cache.put("google", key2, "ref2")
+ cache.clear()
+ assert cache.get("google", key2) is None
+
+def test_should_use_google_file_api():
+ # threshold is 20MB
+ assert should_use_google_file_api(10 * 1024 * 1024) is False
+ assert should_use_google_file_api(25 * 1024 * 1024) is True
+
+def test_prepare_google_content_part():
+ class _Part:
+ @staticmethod
+ def from_bytes(data, mime_type):
+ return {"data": data, "mime": mime_type}
+
+ with patch("google.genai.types.Part", _Part):
+ # Under threshold
+ res = prepare_google_content_part(b"abc", "image/png")
+ assert res == {"data": b"abc", "mime": "image/png"}
+
+ # Over threshold
+ with pytest.raises(ValueError):
+ prepare_google_content_part(b"abc" * 10 * 1024 * 1024, "image/png")
+
+@pytest.mark.asyncio
+async def test_upload_to_google_file_api():
+ class _Part:
+ @staticmethod
+ def from_uri(file_uri, mime_type):
+ return {"uri": file_uri, "mime": mime_type}
+
+ mock_client = MagicMock()
+ mock_upload_res = MagicMock()
+ mock_upload_res.uri = "gs://test-bucket/file-1"
+ mock_upload_res.mime_type = "image/png"
+ mock_client.files.upload.return_value = mock_upload_res
+
+ # Mock google.genai.types.Part
+ with patch("google.genai.types.Part", _Part):
+ # 1. Successful upload without cache
+ res = await upload_to_google_file_api(b"data", "image/png", client=mock_client)
+ assert res == {"uri": "gs://test-bucket/file-1", "mime": "image/png"}
+
+ # 2. Caching logic (hit and miss)
+ cache = ProviderMediaCache()
+ res_uncached = await upload_to_google_file_api(b"data", "image/png", cache=cache, client=mock_client)
+ assert res_uncached == {"uri": "gs://test-bucket/file-1", "mime": "image/png"}
+
+ # Call again -> should hit cache and not call upload
+ mock_client.files.upload.reset_mock()
+ res_cached = await upload_to_google_file_api(b"data", "image/png", cache=cache, client=mock_client)
+ assert res_cached == {"uri": "gs://test-bucket/file-1", "mime": "image/png"}
+ mock_client.files.upload.assert_not_called()
+
+ # 3. Default client creation test (mocking client = None)
+ mock_genai_client = MagicMock()
+ mock_genai_client.files.upload.return_value = mock_upload_res
+ with patch("google.genai.Client", return_value=mock_genai_client):
+ res_default_client = await upload_to_google_file_api(b"data_other", "image/png", client=None)
+ assert res_default_client == {"uri": "gs://test-bucket/file-1", "mime": "image/png"}
+
+def test_openai_helpers():
+ search_tool = create_openai_file_search_tool(["file-1"])
+ assert search_tool == {
+ "type": "file_search",
+ "file_search": {
+ "vector_store_ids": []
+ }
+ }
+
+ attachment = create_openai_file_attachment("file-2", ["file_search"])
+ assert attachment == {
+ "file_id": "file-2",
+ "tools": [{"type": "file_search"}]
+ }
diff --git a/tests/storage/test_init.py b/tests/storage/test_init.py
new file mode 100644
index 0000000..abf3135
--- /dev/null
+++ b/tests/storage/test_init.py
@@ -0,0 +1,11 @@
+import pytest
+import agentflow.storage
+
+
+def test_storage_lazy_exports():
+ assert agentflow.storage.make_agent_memory_tool is not None
+ assert agentflow.storage.make_user_memory_tool is not None
+ assert agentflow.storage.memory_tool is not None
+
+ with pytest.raises(AttributeError):
+ _ = agentflow.storage.invalid_attribute_name_xxx
diff --git a/tests/testing/test_quick_test.py b/tests/testing/test_quick_test.py
index 7b8e0de..1eefb75 100644
--- a/tests/testing/test_quick_test.py
+++ b/tests/testing/test_quick_test.py
@@ -237,3 +237,58 @@ def test_extract_response_last_assistant(self):
]
result = QuickTest._extract_response({"messages": msgs})
assert result == "final response"
+
+ @pytest.mark.asyncio
+ async def test_with_tools_actual_execution(self):
+ """Test QuickTest.with_tools actual execution route."""
+ # This will call QuickTest.with_tools, which routes through ToolNode
+ result = await QuickTest.with_tools(
+ query="Tell me the weather in SF",
+ response="The weather in SF is sunny",
+ tools=["get_weather"],
+ tool_responses={"get_weather": "sunny"}
+ )
+ assert result is not None
+ assert result.final_response == "The weather in SF is sunny"
+ assert len(result.tool_calls) > 0
+ assert result.tool_calls[0]["name"] == "get_weather"
+
+ @pytest.mark.asyncio
+ async def test_multi_turn_fallback_no_state_context(self):
+ """Test multi_turn fallback when state context is not present."""
+ # We can mock compiled.ainvoke to return a dict without state context
+ # to hit line 150 of quick_test.py
+ from agentflow.core.graph.compiled_graph import CompiledGraph
+ from agentflow.core.state import Message
+ from unittest.mock import AsyncMock, MagicMock, patch
+
+ mock_compiled = MagicMock(spec=CompiledGraph)
+ mock_compiled.ainvoke = AsyncMock(return_value={
+ "messages": [Message.text_message("User message", role="user"), Message.text_message("Response", role="assistant")]
+ })
+
+ with patch("agentflow.core.graph.StateGraph.compile", return_value=mock_compiled):
+ result = await QuickTest.multi_turn(
+ conversation=[("User message", "Response")]
+ )
+ assert result.final_response == "Response"
+ assert len(result.messages) == 2
+
+ def test_extract_response_no_text_method_fallback(self):
+ """Test extract response fallback when message text attribute is not a method/callable."""
+ # Create a mock message where hasattr(msg, "text") is False but hasattr(msg, "content") is True
+ msg = type("MockMsg", (), {
+ "role": "assistant",
+ "content": "Fall back to content string"
+ })
+ result = QuickTest._extract_response({"messages": [msg]})
+ assert result == "Fall back to content string"
+
+ # Create a mock message where msg has no content but has role
+ msg2 = type("MockMsg", (), {
+ "role": "assistant"
+ })
+ result = QuickTest._extract_response({"messages": [msg2]})
+ # str(msg2) is returned
+ assert "MockMsg" in result
+
diff --git a/tests/utils/test_call_llm.py b/tests/utils/test_call_llm.py
index 319ada3..90ecadc 100644
--- a/tests/utils/test_call_llm.py
+++ b/tests/utils/test_call_llm.py
@@ -9,6 +9,9 @@
from agentflow.core.llm.caller import (
_extract_responses_text,
call_llm,
+ _call_google,
+ _call_openai_responses,
+ _call_openai_chat,
)
@@ -157,3 +160,136 @@ def test_extract_falls_back_to_output_items():
def test_extract_returns_empty_when_no_text():
r = _make_response(output_text=None, output=[])
assert _extract_responses_text(r) == ""
+
+
+# ---------------------------------------------------------------------------
+# Direct private call implementations
+# ---------------------------------------------------------------------------
+
+@pytest.mark.anyio
+async def test_call_google_implementation():
+ mock_client = MagicMock()
+ mock_response = MagicMock()
+ mock_response.text = " hello google "
+ mock_response.usage_metadata.prompt_token_count = 10
+ mock_response.usage_metadata.candidates_token_count = 5
+ mock_response.usage_metadata.cached_content_token_count = 3
+
+ mock_client.aio.models.generate_content = AsyncMock(return_value=mock_response)
+
+ # 1. With system prompt, temperature, json mode
+ res = await _call_google(
+ mock_client,
+ "gemini-2.0-flash",
+ "user prompt",
+ system_prompt="sys prompt",
+ max_tokens=100,
+ temperature=0.5,
+ json_mode=True
+ )
+
+ assert res == ("hello google", 10, 5, 3)
+ mock_client.aio.models.generate_content.assert_called_once()
+ _, kwargs = mock_client.aio.models.generate_content.call_args
+ assert kwargs["model"] == "gemini-2.0-flash"
+ assert kwargs["contents"] == "user prompt"
+ config = kwargs["config"]
+ assert config.max_output_tokens == 100
+ assert config.temperature == 0.5
+ assert config.response_mime_type == "application/json"
+ assert config.system_instruction == "sys prompt"
+
+ # 2. With cached content (system prompt should be ignored)
+ mock_client.aio.models.generate_content.reset_mock()
+ res = await _call_google(
+ mock_client,
+ "gemini-2.0-flash",
+ "user prompt",
+ system_prompt="sys prompt",
+ max_tokens=100,
+ temperature=0.5,
+ json_mode=False,
+ cached_content="cachedContents/abc123"
+ )
+
+ _, kwargs = mock_client.aio.models.generate_content.call_args
+ config = kwargs["config"]
+ assert config.cached_content == "cachedContents/abc123"
+ assert getattr(config, "system_instruction", None) is None
+
+
+@pytest.mark.anyio
+async def test_call_openai_responses_implementation():
+ mock_client = MagicMock()
+ mock_response = MagicMock()
+ mock_response.output_text = " hello responses "
+
+ mock_response.usage.input_tokens = 20
+ mock_response.usage.output_tokens = 15
+ mock_response.usage.input_tokens_details.cached_tokens = 7
+
+ mock_client.responses.create = AsyncMock(return_value=mock_response)
+
+ res = await _call_openai_responses(
+ mock_client,
+ "gpt-4o-mini",
+ "user prompt",
+ system_prompt="sys prompt",
+ max_tokens=200,
+ temperature=0.7,
+ json_mode=True,
+ extra_param="extra_val"
+ )
+
+ assert res == ("hello responses", 20, 15, 7)
+
+ mock_client.responses.create.assert_called_once()
+ _, kwargs = mock_client.responses.create.call_args
+ assert kwargs["model"] == "gpt-4o-mini"
+ assert kwargs["input"] == "user prompt"
+ assert kwargs["max_output_tokens"] == 200
+ assert kwargs["temperature"] == 0.7
+ assert kwargs["instructions"] == "sys prompt"
+ assert kwargs["text"] == {"format": {"type": "json_object"}}
+ assert kwargs["extra_param"] == "extra_val"
+
+
+@pytest.mark.anyio
+async def test_call_openai_chat_implementation():
+ mock_client = MagicMock()
+ mock_response = MagicMock()
+
+ choice = MagicMock()
+ choice.message.content = " hello chat "
+ mock_response.choices = [choice]
+
+ mock_response.usage.prompt_tokens = 30
+ mock_response.usage.completion_tokens = 25
+ mock_response.usage.prompt_tokens_details.cached_tokens = 12
+
+ mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
+
+ res = await _call_openai_chat(
+ mock_client,
+ "gpt-4o-mini",
+ "user prompt",
+ system_prompt="sys prompt",
+ max_tokens=300,
+ temperature=0.8,
+ json_mode=True,
+ another_param="another_val"
+ )
+
+ assert res == ("hello chat", 30, 25, 12)
+
+ mock_client.chat.completions.create.assert_called_once()
+ _, kwargs = mock_client.chat.completions.create.call_args
+ assert kwargs["model"] == "gpt-4o-mini"
+ assert kwargs["messages"] == [
+ {"role": "system", "content": "sys prompt"},
+ {"role": "user", "content": "user prompt"}
+ ]
+ assert kwargs["max_tokens"] == 300
+ assert kwargs["temperature"] == 0.8
+ assert kwargs["response_format"] == {"type": "json_object"}
+ assert kwargs["another_param"] == "another_val"