From df5dc130bdb8b94f1909e3a7c80944a43b04ad6d Mon Sep 17 00:00:00 2001
From: Shudipto Trafder <shudiptotrafder@gmail.com>
Date: Sat, 13 Jun 2026 20:50:27 +0600
Subject: [PATCH 1/2] feat: add evaluation report JSON structure and tests for
 import integrity

- Added a new evaluation report JSON file to capture evaluation results and metadata.
- Updated test files to ensure no dead imports from removed module paths in documentation and examples.
- Implemented a regression guard for import order to prevent ImportError issues.
- Refactored imports in example files to align with the new module structure.
---
 CLAUDE.md                                     | 182 ++++
 README.md                                     |  87 +-
 agentflow/core/__init__.py                    |  71 +-
 agentflow/core/graph/__init__.py              |   2 +-
 agentflow/core/skills/__init__.py             |   4 +-
 agentflow/qa/evaluation/__init__.py           |   4 +-
 .../qa/evaluation/collectors/__init__.py      |   2 +-
 agentflow/qa/evaluation/config/__init__.py    |   4 +-
 agentflow/qa/evaluation/criteria/__init__.py  |   2 +-
 agentflow/qa/evaluation/dataset/__init__.py   |   2 +-
 agentflow/qa/evaluation/evaluator.py          |   4 +-
 agentflow/qa/evaluation/reporters/manager.py  |   4 +-
 .../evaluation/simulators/user_simulator.py   |   2 +-
 agentflow/qa/evaluation/testing.py            |   4 +-
 agentflow/qa/testing/__init__.py              |   4 +-
 eval_reports/s-file_20260613_204737.html      | 954 ++++++++++++++++++
 eval_reports/s-file_20260613_204737.json      | 123 +++
 .../evaluation/test1/test_weather_agent.py    |   5 +-
 examples/evaluation/test_graph/__init__.py    |   2 +-
 examples/github-mcp/git_mcp.py                |   2 +-
 examples/tool-decorator/README.md             |   2 +-
 tests/test_docs_imports.py                    | 115 +++
 tests/test_import_order.py                    |  66 ++
 23 files changed, 1564 insertions(+), 83 deletions(-)
 create mode 100644 CLAUDE.md
 create mode 100644 eval_reports/s-file_20260613_204737.html
 create mode 100644 eval_reports/s-file_20260613_204737.json
 create mode 100644 tests/test_docs_imports.py
 create mode 100644 tests/test_import_order.py

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..ed462493
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,182 @@
+# agentflow (core Python library) — Engineering Guide
+
+This file documents the **core Python framework** only (`10xscale-agentflow`, the package that
+lives in this folder). For the API/CLI, TS client, docs, or playground, see the CLAUDE.md in
+their respective folders and the workspace-root `CLAUDE.md` for the monorepo overview.
+
+- Package name (PyPI): `10xscale-agentflow`
+- Version: `0.7.5.1` (single source of truth: `pyproject.toml`)
+- Requires: Python >= 3.12
+- Importable top-level package lives at `agentflow/agentflow/` (this folder is the repo root;
+  the importable package is the nested `agentflow/` directory).
+
+## What this package is
+
+A graph-based orchestration engine for multi-agent LLM systems. It is **LLM-agnostic**: you bring
+the provider SDK (OpenAI / Google GenAI), and Agentflow provides the workflow engine, state,
+persistence, tools, memory, evaluation, and event publishing. Inspired by LangGraph but simpler.
+
+## Working principles for this codebase
+
+- **Read before writing.** The public API is large and re-exported through many `__init__.py`
+  files. Confirm the real export path before referencing a symbol (see Import Map below).
+- **Examples are the source of truth**, not the README. `examples/` uses current import paths;
+  the README and several docstrings have shown pre-refactor paths before (see Known Doc Drift).
+- **Surgical edits.** This is `Development Status :: 5 - Production/Stable`. Don't refactor
+  module boundaries or rename exports without checking every `__init__.py` that re-exports them.
+- **Keep coverage green.** `pytest` enforces `--cov-fail-under=70`. New code needs tests.
+- **Optional deps are optional.** Provider SDKs, MCP, Postgres, Redis, Qdrant, Mem0, Kafka,
+  RabbitMQ, OTEL, a2a are all extras. Guard imports; never make core import a hard optional dep.
+
+## Package layout (real, current)
+
+The importable package is `agentflow/agentflow/`. Top-level subpackages:
+
+| Subpackage | What lives there |
+|---|---|
+| `core/` | The engine. `graph/` (StateGraph, Agent, ToolNode, CompiledGraph, Node, Edge), `state/` (AgentState, Message, content blocks, reducers, context managers), `llm/` (provider detection + client factory + `call_llm`), `skills/` (dynamic skill injection), `exceptions/` |
+| `storage/` | `checkpointer/` (InMemory, Pg), `store/` (vector/long-term memory: Qdrant, Mem0, embeddings), `media/` (multimodal media processing, offload, resolvers, stores) |
+| `runtime/` | `adapters/llm/` (OpenAI / OpenAI-Responses / Google GenAI response converters), `publisher/` (Console, Redis, Kafka, RabbitMQ, OTEL, Composite), `protocols/` (a2a, acp) |
+| `prebuilt/` | `agent/` (React, RAG, PlanActReflect, SupervisorTeam, Swarm, StructuredOutput), `tools/` (calculator, fetch, files, handoff, memory, search) |
+| `qa/` | `evaluation/` (criteria, datasets, evaluator, reporters, simulators) and `testing/` (TestAgent, mocks, quick tests) |
+| `utils/` | constants (START/END/ResponseGranularity), `tool` decorator, `convert_messages`, callbacks, validators, id generators, background tasks, graceful shutdown |
+
+## Import Map (verified) — this is the part that bites people
+
+The package was restructured into `core/`, `storage/`, `runtime/`, `qa/`. **There are no
+top-level `agentflow.graph`, `agentflow.state`, `agentflow.checkpointer`, `agentflow.skills`,
+`agentflow.evaluation`, `agentflow.testing`, `agentflow.adapters`, or `agentflow.publisher`
+shims.** Those paths raise `ModuleNotFoundError`. Use the canonical paths:
+
+```python
+# Graph engine
+from agentflow.core.graph import Agent, StateGraph, ToolNode, CompiledGraph, Node, Edge, RetryConfig
+# or the aggregate: from agentflow.core import StateGraph, Agent, ToolNode, AgentState, Message, ...
+
+# State and messages
+from agentflow.core.state import AgentState, Message, TextBlock, ToolResultBlock, add_messages
+
+# LLM client/provider helpers
+from agentflow.core.llm import call_llm, create_llm_client, detect_provider
+
+# Skills
+from agentflow.core.skills import SkillConfig, SkillMeta, SkillsRegistry
+
+# Persistence
+from agentflow.storage.checkpointer import InMemoryCheckpointer, PgCheckpointer, BaseCheckpointer
+# Vector / long-term memory
+from agentflow.storage.store import QdrantStore, Mem0Store, MemoryConfig, AgentMemoryConfig
+
+# Publishers / converters
+from agentflow.runtime.publisher import ConsolePublisher, RedisPublisher, KafkaPublisher, RabbitMQPublisher
+from agentflow.runtime.adapters.llm import OpenAIConverter, GoogleGenAIConverter, OpenAIResponsesConverter
+
+# Prebuilt
+from agentflow.prebuilt.agent import ReactAgent, RAGAgent, SwarmAgent, SupervisorTeamAgent
+from agentflow.prebuilt.tools import safe_calculator, fetch_url, create_handoff_tool, memory_tool
+
+# QA
+from agentflow.qa.evaluation import AgentEvaluator, EvalConfig, EvalCase, EvalSet
+from agentflow.qa.testing import TestAgent, MockMCPClient, MockToolRegistry
+
+# Utils
+from agentflow.utils import tool, convert_messages, Command
+from agentflow.utils.constants import START, END, ResponseGranularity
+```
+
+Note: the root `agentflow/__init__.py` is intentionally empty. Importing the package does not
+eagerly pull in submodules; import the subpackage you need.
+
+## Core concepts
+
+**StateGraph -> CompiledGraph.** Build with `StateGraph()`, `add_node`, `add_edge`,
+`add_conditional_edges`, `set_entry_point`; then `.compile(...)` returns a `CompiledGraph`.
+`compile()` accepts: `checkpointer`, `store`, `media_store`, `interrupt_before`,
+`interrupt_after`, `callback_manager`, `shutdown_timeout` (default 30.0).
+
+**CompiledGraph execution API:** `invoke` / `ainvoke` (run), `stream` / `astream` (incremental),
+`stop` / `astop` (interrupt), `override_node`, `attach_remote_tools`, `generate_graph`, `aclose`.
+- Input shape: `{"messages": [Message...]}`.
+- Config keys: `user_id`, `thread_id`, `run_id`, `recursion_limit` (default 25).
+- `response_granularity`: `LOW` (messages only, default), `PARTIAL` (context+summary+messages),
+  `FULL` (full state).
+
+**Agent class** (`agentflow.core.graph.Agent`) — the high-level node that wraps LLM calls,
+message conversion, and tool integration. Key constructor params:
+`model` (required), `output_type="text"`, `system_prompt`, `tool_node` (name or ToolNode),
+`extra_messages`, `trim_context`, `tools_tags`, `reasoning_config`, `skills`, `memory`,
+`retry_config` (default True), `fallback_models`, `multimodal_config`, `output_schema`.
+
+**Model strings and providers.** `detect_provider(model)` infers the provider from a
+`"provider/model"` prefix or the model name. **It only resolves to `"google"` or `"openai"`.**
+Examples: `"gemini/gemini-2.5-flash"`, `"openai/gpt-4o"`, `"gpt-4o-mini"`. Vertex AI is selected
+via `use_vertex_ai=True`. There is **no native Anthropic client** in the LLM factory despite
+Anthropic/Claude appearing in marketing copy; Claude is reachable only via an OpenAI-compatible
+endpoint or the custom-functions approach. Verify before promising native Claude support.
+
+**ToolNode.** `ToolNode(tools, client=None, pass_user_info_to_mcp=False)`. First positional arg
+is `tools` (an iterable of callables). `client` is an MCP client (fastmcp/mcp). Tools run in
+**parallel** when the LLM requests several at once. Define tools as plain functions; injectable
+params (`tool_call_id`, `state`, `config`, plus InjectQ-provided deps) are filled automatically.
+
+**State and Message.** `AgentState` is a Pydantic model; subclass it for custom fields.
+`Message.text_message(content, role="user")` is the text factory. `Message.tool_message(...)`,
+`Message.image_message(...)` exist. There is **no `Message.from_text`** (old READMEs showed
+it). Content is a list of typed blocks (TextBlock, ImageBlock, ToolCallBlock, ToolResultBlock,
+ReasoningBlock, etc.). Reducers (`add_messages`, `replace_messages`, `append_items`) control how
+state lists merge.
+
+**Persistence.** `InMemoryCheckpointer` for dev/tests. `PgCheckpointer` (Postgres + Redis dual
+layer) for production; requires `[pg_checkpoint]`.
+
+**Memory / store.** 3-layer model: working state -> checkpointer (hot/durable) -> vector store
+(Qdrant/Mem0) for long-term. `MemoryConfig` / `AgentMemoryConfig` drive it; `memory_tool` and
+`create_memory_preload_node` wire it into a graph.
+
+**Skills.** `SkillConfig(skills_dir=...)` adds dynamic skill injection. Two modes: `on-demand`
+(LLM calls `set_skill()` from a trigger table) and `session` (preload a fixed skill from a state
+field via `preload_from`).
+
+**Publishers.** Emit execution events to Console, Redis Pub/Sub, Kafka, RabbitMQ, or OTEL.
+`CompositePublisher` fans out to several. OTEL publisher provides tracing (`setup_tracing`).
+
+**QA.** `agentflow.qa.evaluation` is a full eval framework (criteria incl. LLM-as-judge,
+trajectory matching, rubric, safety, hallucination; datasets; console/JSON/HTML/JUnit reporters;
+user simulators). `agentflow.qa.testing` provides `TestAgent`, `MockMCPClient`, `MockToolRegistry`,
+`TestContext` for unit-testing graphs without live LLMs.
+
+## Development workflow
+
+This repo root is `agentflow/`; the importable package is `agentflow/agentflow/`. A `.venv` is
+already present.
+
+```bash
+# from this folder (agentflow/)
+.venv/bin/python -m pytest               # full suite (enforces coverage >= 70%)
+.venv/bin/python -m pytest tests/graph   # one area
+ruff check . && ruff format .            # lint + format (line-length 100, py312)
+# editable install with extras for local dev:
+pip install -e ".[google-genai,openai,mcp,pg_checkpoint]"
+```
+
+- Tests live in `tests/` (mirrors package layout: `graph/`, `state/`, `storage/`, `store/`,
+  `checkpointer/`, `publisher/`, `prebuilt/`, `evaluation/`, `testing/`, plus `chaos/`,
+  `benchmarks/`, `integration/`). Markers: `asyncio`, `integration` (needs real DBs), `slow`.
+- Lint config is in `pyproject.toml` `[tool.ruff]` (broad rule set; per-file ignores for a few
+  large modules). `mypy` and `bandit` are also configured there.
+- `examples/` is organized by feature (react, rag, swarm, supervisor_team, memory, skills, mcp,
+  a2a_sdk, evaluation, testing, multimodal, structured_output, ...). Use these as canonical usage.
+
+## Known doc drift (do not copy from these without checking)
+
+- **README.md import paths were stale (now corrected).** Older revisions imported `agentflow.graph`,
+  `agentflow.state`, `agentflow.checkpointer` — all removed. Use `agentflow.core.*` / `agentflow.storage.*`.
+- **`Message.from_text` does not exist** (older READMEs used it). Use `Message.text_message`.
+- **`ToolNode(functions=...)`** keyword is wrong (older README MCP example). The param is `tools`.
+- A few `examples/` files still use dead paths (`agentflow.state.message`, `agentflow.graph.tool_node`,
+  `agentflow.evaluation.*`). Treat those specific files as broken until fixed.
+- README/docstrings imply native Anthropic support; the LLM factory only builds google/openai
+  clients. See Model strings above.
+
+When you touch any of the above, prefer fixing the doc/example to match the code rather than the
+reverse, unless the export path itself is the bug.
diff --git a/README.md b/README.md
index d0776d0c..f7cb0290 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 
 # 10xScale Agentflow
 
-![PyPI](https://img.shields.io/pypi/v/agentflow?color=blue)
+![PyPI](https://img.shields.io/pypi/v/10xscale-agentflow?color=blue)
 ![License](https://img.shields.io/github/license/10xhub/agentflow)
-![Python](https://img.shields.io/pypi/pyversions/agentflow)
+![Python](https://img.shields.io/pypi/pyversions/10xscale-agentflow)
 [![Coverage](https://img.shields.io/badge/coverage-74%25-yellow.svg)](#)
 
 **10xScale Agentflow** is a lightweight Python framework for building intelligent agents and orchestrating multi-agent workflows. It's an **LLM-agnostic orchestration tool** that works with native SDKs from OpenAI, Google Gemini, Anthropic Claude, or any other provider. You choose your LLM library; 10xScale Agentflow provides the workflow orchestration.
@@ -17,7 +17,7 @@
 - **🤖 Multi-Agent Workflows** - Build complex agent systems with your choice of orchestration patterns
 - **📊 Structured Responses** - Get `content`, optional `thinking`, and `usage` in a standardized format
 - **🌊 Streaming Support** - Real-time incremental responses with delta updates
-- **🔧 Tool Integration** - Native support for function calling, MCP, Composio, and LangChain tools with **parallel execution**
+- **🔧 Tool Integration** - Native support for function calling and MCP tools with **parallel execution**
 - **🔀 LangGraph-Inspired Engine** - Flexible graph orchestration with nodes, conditional edges, and control flow
 - **💾 State Management** - Built-in persistence with in-memory and PostgreSQL+Redis checkpointers
 - **🔄 Human-in-the-Loop** - Pause/resume execution for approval workflows and debugging
@@ -51,8 +51,6 @@ Agentflow stands out with powerful features designed for production-grade AI app
    - Remote tools (via TypeScript SDK)
    - Agent handoff tools (multi-agent collaboration)
    - MCP (Model Context Protocol)
-   - LangChain tools
-   - Composio tools
 
 ### 🎯 **Intelligent Context Management**
 
@@ -76,7 +74,7 @@ Agentflow stands out with powerful features designed for production-grade AI app
    - Kafka
    - RabbitMQ
    - Redis Pub/Sub
-   - OpenTelemetry (planned)
+   - OpenTelemetry
    - Custom publishers
 
 ### 🔄 **Advanced Execution Features**
@@ -155,19 +153,21 @@ pip install 10xscale-agentflow[mcp]
 # Google GenAI adapter (google-genai SDK)
 pip install 10xscale-agentflow[google-genai]
 
-# Composio tools (adapter)
-pip install 10xscale-agentflow[composio]
+# OpenAI adapter (openai SDK)
+pip install 10xscale-agentflow[openai]
 
-# LangChain tools (registry-based adapter)
-pip install 10xscale-agentflow[langchain]
+# Vector / long-term memory stores
+pip install 10xscale-agentflow[qdrant]    # Qdrant store
+pip install 10xscale-agentflow[mem0]      # Mem0 store
 
 # Individual publishers
 pip install 10xscale-agentflow[redis]     # Redis publisher
 pip install 10xscale-agentflow[kafka]     # Kafka publisher
 pip install 10xscale-agentflow[rabbitmq]  # RabbitMQ publisher
+pip install 10xscale-agentflow[otel]      # OpenTelemetry tracing
 
 # Multiple extras
-pip install 10xscale-agentflow[pg_checkpoint,mcp,google-genai,composio,langchain]
+pip install 10xscale-agentflow[pg_checkpoint,mcp,google-genai,openai]
 ```
 
 ### Environment Setup
@@ -204,8 +204,8 @@ If you have a `.env` file, it will be auto-loaded (via `python-dotenv`).
 Here's a complete tool-calling agent in under 30 lines:
 
 ```python
-from agentflow.graph import Agent, StateGraph, ToolNode
-from agentflow.state import AgentState, Message
+from agentflow.core.graph import Agent, StateGraph, ToolNode
+from agentflow.core.state import AgentState, Message
 from agentflow.utils.constants import END
 
 
@@ -220,7 +220,7 @@ graph = StateGraph()
 graph.add_node("MAIN", Agent(
     model="gemini/gemini-2.5-flash",
     system_prompt=[{"role": "system", "content": "You are a helpful assistant."}],
-    tool_node_name="TOOL"
+    tool_node="TOOL"
 ))
 graph.add_node("TOOL", ToolNode([get_weather]))
 
@@ -259,12 +259,11 @@ For maximum control, use custom functions instead of the Agent class:
 from dotenv import load_dotenv
 from openai import AsyncOpenAI
 
-from agentflow.checkpointer import InMemoryCheckpointer
-from agentflow.graph import StateGraph, ToolNode
-from agentflow.state.agent_state import AgentState
-from agentflow.utils import Message
+from agentflow.core.graph import StateGraph, ToolNode
+from agentflow.core.state import AgentState, Message
+from agentflow.storage.checkpointer import InMemoryCheckpointer
+from agentflow.utils import convert_messages
 from agentflow.utils.constants import END
-from agentflow.utils.converter import convert_messages
 
 load_dotenv()
 client = AsyncOpenAI()
@@ -353,7 +352,7 @@ graph.set_entry_point("MAIN")
 # Compile and run
 app = graph.compile(checkpointer=InMemoryCheckpointer())
 
-inp = {"messages": [Message.from_text("What's the weather in New York?")]}
+inp = {"messages": [Message.text_message("What's the weather in New York?")]}
 config = {"thread_id": "12345", "recursion_limit": 10}
 
 res = app.invoke(inp, config=config)
@@ -427,12 +426,11 @@ from dotenv import load_dotenv
 from fastmcp import Client
 from openai import AsyncOpenAI
 
-from agentflow.checkpointer import InMemoryCheckpointer
-from agentflow.graph import StateGraph, ToolNode
-from agentflow.state.agent_state import AgentState
-from agentflow.utils import Message
+from agentflow.core.graph import StateGraph, ToolNode
+from agentflow.core.state import AgentState, Message
+from agentflow.storage.checkpointer import InMemoryCheckpointer
+from agentflow.utils import convert_messages
 from agentflow.utils.constants import END
-from agentflow.utils.converter import convert_messages
 
 load_dotenv()
 client = AsyncOpenAI()
@@ -451,7 +449,7 @@ config = {
 client_http = Client(config)
 
 # Initialize ToolNode with MCP client
-tool_node = ToolNode(functions=[], client=client_http)
+tool_node = ToolNode([], client=client_http)
 
 
 async def main_agent(state: AgentState):
@@ -509,7 +507,7 @@ graph.set_entry_point("MAIN")
 app = graph.compile(checkpointer=checkpointer)
 
 # Run the agent
-inp = {"messages": [Message.from_text("Please call the get_weather function for New York City")]}
+inp = {"messages": [Message.text_message("Please call the get_weather function for New York City")]}
 config = {"thread_id": "12345", "recursion_limit": 10}
 
 res = app.invoke(inp, config=config)
@@ -551,12 +549,11 @@ import logging
 from dotenv import load_dotenv
 from openai import AsyncOpenAI
 
-from agentflow.checkpointer import InMemoryCheckpointer
-from agentflow.graph import StateGraph, ToolNode
-from agentflow.state.agent_state import AgentState
-from agentflow.utils import Message, ResponseGranularity
+from agentflow.core.graph import StateGraph, ToolNode
+from agentflow.core.state import AgentState, Message
+from agentflow.storage.checkpointer import InMemoryCheckpointer
+from agentflow.utils import ResponseGranularity, convert_messages
 from agentflow.utils.constants import END
-from agentflow.utils.converter import convert_messages
 
 load_dotenv()
 client = AsyncOpenAI()
@@ -647,7 +644,7 @@ app = graph.compile(checkpointer=checkpointer)
 
 
 async def run_stream_test():
-    inp = {"messages": [Message.from_text("Call get_weather for Tokyo, then reply.")]}
+    inp = {"messages": [Message.text_message("Call get_weather for Tokyo, then reply.")]}
     config = {"thread_id": "stream-1", "recursion_limit": 10}
 
     logging.info("--- streaming start ---")
@@ -693,7 +690,7 @@ python examples/react_stream/stream_react_agent.py
 # Parallel execution:   max(1.0, 1.5, 0.8) = 1.5 seconds ⚡
 ```
 
-See the [parallel tool execution documentation](https://10xhub.github.io/10xScale Agentflow/Concept/graph/tools/#parallel-tool-execution) for more details.
+See the [parallel tool execution documentation](https://10xhub.github.io/Agentflow/Concept/graph/tools/#parallel-tool-execution) for more details.
 
 ---
 
@@ -778,14 +775,14 @@ See `pyproject.dev.toml` for complete tool configurations.
 
 - ✅ Core graph engine with nodes and edges
 - ✅ State management and checkpointing
-- ✅ Tool integration (MCP, Composio, LangChain)
+- ✅ Tool integration (MCP, custom tools, parallel execution)
 - ✅ **Parallel tool execution** for improved performance
 - ✅ Streaming and event publishing
 - ✅ Human-in-the-loop support
 - ✅ Prebuilt agent patterns
-- 🚧 Agent-to-Agent (A2A) communication protocols
+- ✅ Agent-to-Agent (A2A) communication protocols
+- ✅ Observability and tracing (OpenTelemetry)
 - 🚧 Remote node execution for distributed processing
-- 🚧 Enhanced observability and tracing
 - 🚧 More persistence backends (Redis, DynamoDB)
 - 🚧 Parallel/branching strategies
 - 🚧 Visual graph editor
@@ -794,16 +791,16 @@ See `pyproject.dev.toml` for complete tool configurations.
 
 ## 📄 License
 
-MIT License - see [LICENSE](https://github.com/10xhub/10xScale Agentflow/blob/main/LICENSE) for details.
+MIT License - see [LICENSE](https://github.com/10xHub/agentflow/blob/main/LICENSE) for details.
 
 ---
 
 ## 🔗 Links & Resources
 
-- **[Documentation](https://10xhub.github.io/10xScale Agentflow/)** - Full documentation with tutorials and API reference
+- **[Documentation](https://10xhub.github.io/Agentflow/)** - Full documentation with tutorials and API reference
-- **[GitHub Repository](https://github.com/10xhub/10xScale Agentflow)** - Source code and issues
+- **[GitHub Repository](https://github.com/10xHub/agentflow)** - Source code and issues
 - **[PyPI Project](https://pypi.org/project/10xScale-Agentflow/)** - Package releases
-- **[Examples Directory](https://github.com/10xhub/10xScale Agentflow/tree/main/examples)** - Runnable code samples
+- **[Examples Directory](https://github.com/10xHub/agentflow/tree/main/examples)** - Runnable code samples
 
 ---
 
@@ -820,11 +817,11 @@ Contributions are welcome! Please see our [GitHub repository](https://github.com
 
 ## 💬 Support
 
-- **Documentation**: [https://10xhub.github.io/10xScale Agentflow/](https://10xhub.github.io/10xScale Agentflow/)
-- **Examples**: Check the [examples directory](https://github.com/10xhub/10xScale Agentflow/tree/main/examples)
-- **Issues**: Report bugs on [GitHub Issues](https://github.com/10xhub/10xScale Agentflow/issues)
-- **Discussions**: Ask questions in [GitHub Discussions](https://github.com/10xhub/10xScale Agentflow/discussions)
+- **Documentation**: [https://10xhub.github.io/Agentflow/](https://10xhub.github.io/Agentflow/)
+- **Examples**: Check the [examples directory](https://github.com/10xHub/agentflow/tree/main/examples)
+- **Issues**: Report bugs on [GitHub Issues](https://github.com/10xHub/agentflow/issues)
+- **Discussions**: Ask questions in [GitHub Discussions](https://github.com/10xHub/agentflow/discussions)
 
 ---
 
-**Ready to build intelligent agents?** Check out the [documentation](https://10xhub.github.io/10xScale Agentflow/) to get started!
+**Ready to build intelligent agents?** Check out the [documentation](https://10xhub.github.io/Agentflow/) to get started!
diff --git a/agentflow/core/__init__.py b/agentflow/core/__init__.py
index ac03094c..dd80dca1 100644
--- a/agentflow/core/__init__.py
+++ b/agentflow/core/__init__.py
@@ -10,7 +10,9 @@
 
 from __future__ import annotations
 
-from . import exceptions, graph, skills, state
+import typing as _t
+
+from . import exceptions, skills, state
 
 # --- Exceptions ---
 from .exceptions import (
@@ -25,21 +27,64 @@
     TransientStorageError,
 )
 
-# --- Graph ---
-from .graph import (
-    Agent,
-    BaseAgent,
-    CompiledGraph,
-    Edge,
-    Node,
-    RetryConfig,
-    StateGraph,
-    ToolNode,
-)
-
 # --- Skills ---
 from .skills import SkillConfig, SkillMeta, SkillsRegistry
 
+
+# --- Graph (lazy) ---
+# The graph engine is imported lazily to avoid an import cycle: ``agentflow.core.graph`` imports
+# back into ``agentflow.utils`` and ``agentflow.storage.checkpointer``. Importing it eagerly here
+# means that ``import agentflow.utils`` or ``import agentflow.storage.checkpointer`` *as the first
+# import* triggers ``agentflow.core`` -> ``graph`` -> back into the half-initialized module and
+# raises ImportError. Deferring graph keeps ``from agentflow.core import StateGraph`` working while
+# letting those modules be imported in any order. See tests/test_import_order.py.
+_GRAPH_EXPORTS = frozenset(  # names __getattr__ resolves lazily; keep in sync with imports below
+    {
+        "Agent",
+        "BaseAgent",
+        "CompiledGraph",
+        "Edge",
+        "Node",
+        "RetryConfig",
+        "StateGraph",
+        "ToolNode",
+    }
+)
+
+if _t.TYPE_CHECKING:  # static analysis only: not executed at runtime, so no import cycle
+    from . import graph
+    from .graph import (
+        Agent,
+        BaseAgent,
+        CompiledGraph,
+        Edge,
+        Node,
+        RetryConfig,
+        StateGraph,
+        ToolNode,
+    )
+
+
+def __getattr__(name: str) -> _t.Any:
+    """Lazily resolve the graph submodule and its exported symbols (PEP 562).
+
+    Uses ``importlib.import_module`` (not ``from . import graph``) so a re-entrant lookup while
+    ``graph`` is still importing returns the partial module from ``sys.modules`` directly instead
+    of recursing back through this hook via the parent-attribute binding.
+    """
+    if name == "graph" or name in _GRAPH_EXPORTS:
+        import importlib
+
+        graph = importlib.import_module(f"{__name__}.graph")
+        globals()["graph"] = graph  # cache the module only; exported names still resolve here
+        return graph if name == "graph" else getattr(graph, name)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__() -> list[str]:
+    return sorted(set(globals()) | _GRAPH_EXPORTS | {"graph"})  # list lazy names in dir() too
+
+
 # --- State ---
 from .state import (
     AgentState,
diff --git a/agentflow/core/graph/__init__.py b/agentflow/core/graph/__init__.py
index 5334d30b..2c2affca 100644
--- a/agentflow/core/graph/__init__.py
+++ b/agentflow/core/graph/__init__.py
@@ -63,7 +63,7 @@
 ==============
 
     ```python
-    from agentflow.graph import StateGraph, ToolNode
+    from agentflow.core.graph import StateGraph, ToolNode
     from agentflow.utils import START, END
 
 
diff --git a/agentflow/core/skills/__init__.py b/agentflow/core/skills/__init__.py
index f44db151..6fe2a27f 100644
--- a/agentflow/core/skills/__init__.py
+++ b/agentflow/core/skills/__init__.py
@@ -5,7 +5,7 @@
 **on-demand** (default) — the LLM sees a trigger table and calls ``set_skill()``
 to load skill content when a user request matches a skill::
 
-    from agentflow.skills import SkillConfig
+    from agentflow.core.skills import SkillConfig
 
     agent = Agent(
         model="gpt-4o",
@@ -17,7 +17,7 @@
 domain/persona.  The framework reads a state field to identify which skill to
 preload, with no trigger table and no extra tool-call round-trip::
 
-    from agentflow.skills import SkillConfig
+    from agentflow.core.skills import SkillConfig
     from agentflow.core.state import AgentState
 
 
diff --git a/agentflow/qa/evaluation/__init__.py b/agentflow/qa/evaluation/__init__.py
index a3124d50..001cca9f 100644
--- a/agentflow/qa/evaluation/__init__.py
+++ b/agentflow/qa/evaluation/__init__.py
@@ -17,8 +17,8 @@
 
 Example:
     ```python
-    from agentflow.evaluation import AgentEvaluator, EvalConfig, CriterionConfig
-    from agentflow.evaluation.dataset import EvalCase, ToolCall
+    from agentflow.qa.evaluation import AgentEvaluator, EvalConfig, CriterionConfig
+    from agentflow.qa.evaluation.dataset import EvalCase, ToolCall
 
     case = EvalCase.single_turn(
         eval_id="test_1",
diff --git a/agentflow/qa/evaluation/collectors/__init__.py b/agentflow/qa/evaluation/collectors/__init__.py
index 14bf9299..0eaca2ff 100644
--- a/agentflow/qa/evaluation/collectors/__init__.py
+++ b/agentflow/qa/evaluation/collectors/__init__.py
@@ -12,7 +12,7 @@
 
 Example:
     ```python
-    from agentflow.evaluation.collectors import TrajectoryCollector, make_trajectory_callback
+    from agentflow.qa.evaluation.collectors import TrajectoryCollector, make_trajectory_callback
 
     collector = TrajectoryCollector()
     _, mgr = make_trajectory_callback(collector, config={"thread_id": "eval-1"})
diff --git a/agentflow/qa/evaluation/config/__init__.py b/agentflow/qa/evaluation/config/__init__.py
index 6fda9a65..dd09a83b 100644
--- a/agentflow/qa/evaluation/config/__init__.py
+++ b/agentflow/qa/evaluation/config/__init__.py
@@ -5,8 +5,8 @@
 
 Example:
     ```python
-    from agentflow.evaluation.config import EvalConfig, CriterionConfig
-    from agentflow.evaluation.config import EvalPresets, MatchType, Rubric
+    from agentflow.qa.evaluation.config import EvalConfig, CriterionConfig
+    from agentflow.qa.evaluation.config import EvalPresets, MatchType, Rubric
 
     # Use a preset
     config = EvalPresets.tool_usage(strict=True)
diff --git a/agentflow/qa/evaluation/criteria/__init__.py b/agentflow/qa/evaluation/criteria/__init__.py
index b538d49b..2a9c62f1 100644
--- a/agentflow/qa/evaluation/criteria/__init__.py
+++ b/agentflow/qa/evaluation/criteria/__init__.py
@@ -7,7 +7,7 @@
 
 Example:
     ```python
-    from agentflow.evaluation.criteria import (
+    from agentflow.qa.evaluation.criteria import (
         TrajectoryMatchCriterion,
         ResponseMatchCriterion,
         LLMJudgeCriterion,
diff --git a/agentflow/qa/evaluation/dataset/__init__.py b/agentflow/qa/evaluation/dataset/__init__.py
index ce1b085e..75599a0b 100644
--- a/agentflow/qa/evaluation/dataset/__init__.py
+++ b/agentflow/qa/evaluation/dataset/__init__.py
@@ -5,7 +5,7 @@
 
 Example:
     ```python
-    from agentflow.evaluation.dataset import (
+    from agentflow.qa.evaluation.dataset import (
         EvalSet,
         EvalCase,
         EvalSetBuilder,
diff --git a/agentflow/qa/evaluation/evaluator.py b/agentflow/qa/evaluation/evaluator.py
index 7f53c486..c80b95d9 100644
--- a/agentflow/qa/evaluation/evaluator.py
+++ b/agentflow/qa/evaluation/evaluator.py
@@ -55,8 +55,8 @@ class AgentEvaluator:
 
     Example:
         ```python
-        from agentflow.evaluation import AgentEvaluator, EvalConfig
-        from agentflow.evaluation.collectors import TrajectoryCollector, make_trajectory_callback
+        from agentflow.qa.evaluation import AgentEvaluator, EvalConfig
+        from agentflow.qa.evaluation.collectors import TrajectoryCollector, make_trajectory_callback
 
         collector = TrajectoryCollector(capture_all_events=True)
         _, callback_mgr = make_trajectory_callback(collector)
diff --git a/agentflow/qa/evaluation/reporters/manager.py b/agentflow/qa/evaluation/reporters/manager.py
index 466587a3..af2d8c68 100644
--- a/agentflow/qa/evaluation/reporters/manager.py
+++ b/agentflow/qa/evaluation/reporters/manager.py
@@ -61,8 +61,8 @@ class ReporterManager:
 
     Example:
         ```python
-        from agentflow.evaluation.config.eval_config import ReporterConfig
-        from agentflow.evaluation.reporters.manager import ReporterManager
+        from agentflow.qa.evaluation.config.eval_config import ReporterConfig
+        from agentflow.qa.evaluation.reporters.manager import ReporterManager
 
         manager = ReporterManager(ReporterConfig())
         output = manager.run_all(report)
diff --git a/agentflow/qa/evaluation/simulators/user_simulator.py b/agentflow/qa/evaluation/simulators/user_simulator.py
index 8baf73f0..70245137 100644
--- a/agentflow/qa/evaluation/simulators/user_simulator.py
+++ b/agentflow/qa/evaluation/simulators/user_simulator.py
@@ -143,7 +143,7 @@ class UserSimulator:
 
     Example:
         ```python
-        from agentflow.evaluation import (
+        from agentflow.qa.evaluation import (
             UserSimulator,
             ConversationScenario,
             SimulationGoalsCriterion,
diff --git a/agentflow/qa/evaluation/testing.py b/agentflow/qa/evaluation/testing.py
index 69c0d598..d5a4b3b2 100644
--- a/agentflow/qa/evaluation/testing.py
+++ b/agentflow/qa/evaluation/testing.py
@@ -235,7 +235,7 @@ class EvalFixtures:
     Example:
         ```python
         # conftest.py
-        from agentflow.evaluation.testing import EvalFixtures
+        from agentflow.qa.evaluation.testing import EvalFixtures
 
         fixtures = EvalFixtures()
         fixtures.register()
@@ -353,7 +353,7 @@ def create_eval_app(
     Example:
         ```python
         # conftest.py
-        from agentflow.evaluation.testing import create_eval_app
+        from agentflow.qa.evaluation.testing import create_eval_app
 
 
         @pytest.fixture(scope="session")
diff --git a/agentflow/qa/testing/__init__.py b/agentflow/qa/testing/__init__.py
index f3d4706e..c0f31e1a 100644
--- a/agentflow/qa/testing/__init__.py
+++ b/agentflow/qa/testing/__init__.py
@@ -10,7 +10,7 @@
 
 Example:
     ```python
-    from agentflow.testing import TestAgent, TestContext, MockToolRegistry
+    from agentflow.qa.testing import TestAgent, TestContext, MockToolRegistry
 
     # Use TestAgent as a drop-in replacement for Agent
     test_agent = TestAgent(responses=["Hello from test!"])
@@ -28,7 +28,7 @@
     assert tools.was_called("get_weather")
 
     # Use MockMCPClient for testing MCP tools
-    from agentflow.testing import MockMCPClient
+    from agentflow.qa.testing import MockMCPClient
 
     mock_mcp = MockMCPClient()
     mock_mcp.add_tool(
diff --git a/eval_reports/s-file_20260613_204737.html b/eval_reports/s-file_20260613_204737.html
new file mode 100644
index 00000000..2d8f33d1
--- /dev/null
+++ b/eval_reports/s-file_20260613_204737.html
@@ -0,0 +1,954 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>s-file</title>
+    <style>
+
+/* ── Theme variables ────────────────────────────────────────────────────── */
+:root {
+    --color-pass: #22c55e;
+    --color-fail: #ef4444;
+    --color-warn: #f59e0b;
+    --color-bg: #f8fafc;
+    --color-card: #ffffff;
+    --color-border: #e2e8f0;
+    --color-text: #1e293b;
+    --color-muted: #64748b;
+    --color-header-bg: #ffffff;
+    --color-bg-muted: #f3f4f6;
+    --color-accent: #6366f1;
+}
+
+[data-theme="dark"] {
+    --color-pass: #4ade80;
+    --color-fail: #f87171;
+    --color-warn: #fbbf24;
+    --color-bg: #0f172a;
+    --color-card: #1e293b;
+    --color-border: #334155;
+    --color-text: #f1f5f9;
+    --color-muted: #94a3b8;
+    --color-header-bg: #1e293b;
+    --color-bg-muted: #0f172a;
+    --color-accent: #818cf8;
+}
+
+@media (prefers-color-scheme: dark) {
+    :root:not([data-theme]) {
+        --color-pass: #4ade80;
+        --color-fail: #f87171;
+        --color-warn: #fbbf24;
+        --color-bg: #0f172a;
+        --color-card: #1e293b;
+        --color-border: #334155;
+        --color-text: #f1f5f9;
+        --color-muted: #94a3b8;
+        --color-header-bg: #1e293b;
+        --color-bg-muted: #0f172a;
+        --color-accent: #818cf8;
+    }
+}
+
+/* ── Reset ──────────────────────────────────────────────────────────────── */
+*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+
+body {
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto,
+                 Oxygen, Ubuntu, sans-serif;
+    background-color: var(--color-bg);
+    color: var(--color-text);
+    line-height: 1.6;
+}
+
+.container { max-width: 1200px; margin: 0 auto; padding: 0 1.5rem 2rem; }
+
+/* ── Sticky header ──────────────────────────────────────────────────────── */
+.sticky-header {
+    position: sticky;
+    top: 0;
+    z-index: 100;
+    background: var(--color-header-bg);
+    border-bottom: 1px solid var(--color-border);
+    padding: 0.65rem 1.5rem;
+    margin: 0 -1.5rem 1.5rem;
+    box-shadow: 0 1px 8px rgb(0 0 0 / 0.07);
+}
+
+.header-inner {
+    max-width: 1200px;
+    margin: 0 auto;
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 1rem;
+}
+
+/* ── Brand area ─────────────────────────────────────────────────────────── */
+.brand {
+    display: flex;
+    align-items: center;
+    gap: 0.6rem;
+    min-width: 0;
+}
+
+.brand-logo {
+    font-size: 1.6rem;
+    line-height: 1;
+    background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    background-clip: text;
+    flex-shrink: 0;
+}
+
+.brand-text {
+    display: flex;
+    flex-direction: column;
+    line-height: 1.2;
+    flex-shrink: 0;
+}
+
+.brand-name {
+    font-size: 1.1rem;
+    font-weight: 800;
+    background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    background-clip: text;
+    letter-spacing: -0.02em;
+}
+
+.brand-sub {
+    font-size: 0.6rem;
+    font-weight: 500;
+    text-transform: uppercase;
+    letter-spacing: 0.1em;
+    color: var(--color-muted);
+}
+
+.report-title-chip {
+    display: inline-block;
+    padding: 0.2rem 0.6rem;
+    background: var(--color-bg);
+    border: 1px solid var(--color-border);
+    border-radius: 999px;
+    font-size: 0.72rem;
+    font-weight: 500;
+    color: var(--color-muted);
+    white-space: nowrap;
+    overflow: hidden;
+    text-overflow: ellipsis;
+    max-width: 220px;
+}
+
+.header-meta {
+    display: flex;
+    align-items: center;
+    gap: 0.75rem;
+    flex-shrink: 0;
+}
+
+.timestamp { color: var(--color-muted); font-size: 0.8rem; }
+
+.header-link {
+    display: inline-flex;
+    align-items: center;
+    gap: 0.25rem;
+    padding: 0.25rem 0.65rem;
+    border: 1px solid var(--color-border);
+    border-radius: 6px;
+    background: var(--color-card);
+    color: var(--color-text);
+    text-decoration: none;
+    font-size: 0.78rem;
+    font-weight: 500;
+    transition: background 0.15s, border-color 0.15s, color 0.15s;
+    white-space: nowrap;
+}
+.header-link:hover {
+    background: #6366f1;
+    border-color: #6366f1;
+    color: #fff;
+}
+
+.theme-btn {
+    border: 1px solid var(--color-border);
+    background: var(--color-card);
+    color: var(--color-text);
+    border-radius: 6px;
+    padding: 0.25rem 0.5rem;
+    cursor: pointer;
+    font-size: 1rem;
+    line-height: 1;
+    transition: background 0.2s;
+}
+.theme-btn:hover { background: var(--color-bg); }
+
+/* ── Summary stat cards ─────────────────────────────────────────────────── */
+.summary {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
+    gap: 1rem;
+    margin-bottom: 1.5rem;
+}
+
+.stat-card {
+    background: var(--color-card);
+    border: 1px solid var(--color-border);
+    border-radius: 12px;
+    padding: 1.1rem 1rem;
+    text-align: center;
+    transition: box-shadow 0.2s, transform 0.2s;
+}
+.stat-card:hover {
+    box-shadow: 0 8px 24px rgb(0 0 0 / 0.1);
+    transform: translateY(-2px);
+}
+
+.stat-icon { font-size: 1.25rem; margin-bottom: 0.25rem; line-height: 1; }
+.stat-value { font-size: 2rem; font-weight: 800; letter-spacing: -0.03em; }
+.stat-label { color: var(--color-muted); font-size: 0.8rem; font-weight: 500; margin-top: 0.1rem; }
+.stat-pass .stat-value { color: var(--color-pass); }
+.stat-fail .stat-value { color: var(--color-fail); }
+.stat-warn .stat-value { color: var(--color-warn); }
+.stat-rate .stat-value {
+    background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    background-clip: text;
+}
+
+.progress-bar {
+    background: var(--color-border);
+    border-radius: 999px;
+    height: 6px;
+    margin-top: 0.6rem;
+    overflow: hidden;
+}
+
+.progress-fill {
+    background: linear-gradient(90deg, #6366f1, #8b5cf6);
+    height: 100%;
+    transition: width 0.4s ease;
+    border-radius: 999px;
+}
+
+/* ── Chart panels ───────────────────────────────────────────────────────── */
+.charts-section {
+    display: grid;
+    grid-template-columns: 1fr 1fr;
+    gap: 1rem;
+    margin-bottom: 1.5rem;
+}
+
+@media (max-width: 640px) {
+    .charts-section { grid-template-columns: 1fr; }
+}
+
+.chart-panel {
+    background: var(--color-card);
+    border: 1px solid var(--color-border);
+    border-radius: 12px;
+    padding: 1.1rem;
+    min-height: 100px;
+    overflow: hidden;
+    box-shadow: 0 1px 4px rgb(0 0 0 / 0.05);
+}
+
+.chart-panel h2 {
+    font-size: 0.75rem;
+    font-weight: 700;
+    color: var(--color-muted);
+    text-transform: uppercase;
+    letter-spacing: 0.08em;
+    margin-bottom: 0.875rem;
+}
+
+.chart-svg { display: block; width: 100%; overflow: visible; }
+.chart-label { font-size: 11px; fill: var(--color-text); font-family: inherit; }
+.chart-val   { font-size: 11px; fill: var(--color-muted); font-family: inherit; }
+
+/* ── Filter bar (sticky below header) ──────────────────────────────────── */
+.cases-section { margin-bottom: 2rem; }
+
+.filter-bar {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    flex-wrap: wrap;
+    margin-bottom: 1rem;
+    position: sticky;
+    top: 57px;
+    z-index: 50;
+    background: var(--color-bg);
+    padding: 0.5rem 0;
+}
+
+.filter-btn {
+    padding: 0.375rem 0.875rem;
+    border: 1px solid var(--color-border);
+    background: var(--color-card);
+    color: var(--color-text);
+    border-radius: 4px;
+    cursor: pointer;
+    font-size: 0.875rem;
+    transition: background 0.15s;
+}
+.filter-btn.active {
+    background: var(--color-text);
+    color: var(--color-bg);
+    border-color: transparent;
+}
+
+.search-input {
+    margin-left: auto;
+    padding: 0.375rem 0.75rem;
+    border: 1px solid var(--color-border);
+    border-radius: 4px;
+    background: var(--color-card);
+    color: var(--color-text);
+    font-size: 0.875rem;
+    width: 200px;
+}
+.search-input:focus { outline: 2px solid var(--color-pass); outline-offset: 1px; }
+
+h2 { font-size: 1.25rem; margin-bottom: 1rem; }
+
+/* ── Case list ──────────────────────────────────────────────────────────── */
+.case-list { display: flex; flex-direction: column; gap: 0.5rem; }
+
+.case-item {
+    background: var(--color-card);
+    border: 1px solid var(--color-border);
+    border-radius: 10px;
+    overflow: hidden;
+    transition: box-shadow 0.2s, transform 0.15s;
+}
+.case-item:hover {
+    box-shadow: 0 6px 20px -4px rgb(0 0 0 / 0.12);
+    transform: translateY(-1px);
+}
+
+.case-header {
+    display: flex;
+    align-items: center;
+    gap: 0.75rem;
+    padding: 0.875rem 1rem;
+    cursor: pointer;
+    user-select: none;
+}
+
+.expand-icon {
+    color: var(--color-muted);
+    font-size: 0.65rem;
+    transition: transform 0.2s;
+    flex-shrink: 0;
+}
+.case-item.expanded .expand-icon { transform: rotate(90deg); }
+
+.case-status {
+    width: 22px;
+    height: 22px;
+    border-radius: 50%;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    font-weight: bold;
+    color: white;
+    font-size: 0.7rem;
+    flex-shrink: 0;
+}
+.case-status.pass  { background: var(--color-pass); }
+.case-status.fail  { background: var(--color-fail); }
+.case-status.error { background: var(--color-warn); }
+
+.case-name { flex: 1; font-weight: 500; }
+
+.case-score {
+    font-family: monospace;
+    font-size: 0.78rem;
+    color: var(--color-muted);
+    background: var(--color-bg);
+    border-radius: 4px;
+    padding: 0.1rem 0.35rem;
+}
+
+.case-duration { color: var(--color-muted); font-size: 0.875rem; }
+
+/* ── Case detail body ───────────────────────────────────────────────────── */
+.case-details {
+    display: none;
+    padding: 0 1rem 1rem;
+    border-top: 1px solid var(--color-border);
+}
+.case-item.expanded .case-details { display: block; }
+
+.detail-section { margin-top: 0.75rem; }
+
+.detail-section summary {
+    cursor: pointer;
+    font-weight: 600;
+    font-size: 0.875rem;
+    color: var(--color-text);
+    padding: 0.25rem 0;
+    user-select: none;
+    list-style: none;
+}
+.detail-section summary::before { content: '▶ '; font-size: 0.6rem; color: var(--color-muted); }
+details[open] > summary::before { content: '▼ '; }
+
+.response-box {
+    background: var(--color-bg);
+    border: 1px solid var(--color-border);
+    border-radius: 4px;
+    padding: 0.75rem;
+    margin-top: 0.5rem;
+    white-space: pre-wrap;
+    word-break: break-word;
+    font-family: monospace;
+    font-size: 0.8rem;
+    max-height: 200px;
+    overflow-y: auto;
+}
+
+.tool-table {
+    width: 100%;
+    border-collapse: collapse;
+    margin-top: 0.5rem;
+    font-size: 0.8rem;
+}
+.tool-table th, .tool-table td {
+    border: 1px solid var(--color-border);
+    padding: 0.4rem 0.5rem;
+    text-align: left;
+}
+.tool-table th { background: var(--color-bg); font-weight: 600; }
+.tool-table td { font-family: monospace; word-break: break-all; }
+
+.trajectory-timeline {
+    margin-top: 0.5rem;
+    padding-left: 1rem;
+    border-left: 2px solid var(--color-border);
+}
+
+.traj-step {
+    padding: 0.2rem 0 0.2rem 0.75rem;
+    font-size: 0.8rem;
+    position: relative;
+}
+.traj-step::before {
+    content: '';
+    position: absolute;
+    left: -0.42rem;
+    top: 0.55rem;
+    width: 8px;
+    height: 8px;
+    border-radius: 50%;
+    background: var(--color-muted);
+}
+.traj-step.node::before { background: #6366f1; }
+.traj-step.tool::before { background: #f59e0b; }
+.traj-label  { font-weight: 600; }
+.traj-detail { color: var(--color-muted); margin-left: 0.25rem; }
+
+.node-box {
+    background: var(--color-bg);
+    border: 1px solid var(--color-border);
+    border-radius: 4px;
+    padding: 0.5rem 0.75rem;
+    margin-top: 0.35rem;
+    font-size: 0.8rem;
+}
+.node-box-title { font-weight: 600; color: #6366f1; }
+.node-box p {
+    margin: 0.25rem 0 0;
+    font-family: monospace;
+    white-space: pre-wrap;
+    word-break: break-word;
+}
+
+/* ── Criterion score bars ───────────────────────────────────────────────── */
+.criterion-list {
+    display: flex;
+    flex-direction: column;
+    gap: 0.25rem;
+    margin-top: 0.5rem;
+}
+
+.criterion-row {
+    display: grid;
+    grid-template-columns: 1.25rem 9rem 1fr 5.5rem 1.25rem;
+    align-items: center;
+    gap: 0.5rem;
+    padding: 0.35rem 0.5rem;
+    background: var(--color-bg);
+    border-radius: 4px;
+}
+
+.criterion-icon  { font-size: 0.875rem; text-align: center; }
+.criterion-name  { font-size: 0.875rem; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
+
+.score-bar-wrap {
+    background: var(--color-border);
+    border-radius: 999px;
+    height: 8px;
+    overflow: hidden;
+}
+.score-bar-fill {
+    height: 100%;
+    border-radius: 999px;
+    transition: width 0.3s ease;
+}
+
+.score-value { font-family: monospace; font-size: 0.78rem; color: var(--color-muted); white-space: nowrap; }
+
+.criterion-reason {
+    grid-column: 2 / -1;
+    color: var(--color-muted);
+    font-size: 0.78rem;
+    padding-left: 0.25rem;
+    padding-top: 0.1rem;
+}
+
+/* ── Error box ──────────────────────────────────────────────────────────── */
+.error-message {
+    background: #fef2f2;
+    border: 1px solid #fecaca;
+    border-radius: 4px;
+    padding: 0.75rem;
+    color: #991b1b;
+    margin-top: 0.5rem;
+    font-family: monospace;
+    font-size: 0.875rem;
+}
+[data-theme="dark"] .error-message {
+    background: #2d1515;
+    border-color: #7f1d1d;
+    color: #fca5a5;
+}
+@media (prefers-color-scheme: dark) {
+    :root:not([data-theme]) .error-message {
+        background: #2d1515;
+        border-color: #7f1d1d;
+        color: #fca5a5;
+    }
+}
+
+/* ── Footer ─────────────────────────────────────────────────────────────── */
+footer {
+    margin-top: 2.5rem;
+    padding-top: 1.5rem;
+    border-top: 1px solid var(--color-border);
+}
+
+.footer-inner {
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    gap: 0.6rem;
+    text-align: center;
+}
+
+.footer-brand {
+    display: flex;
+    align-items: center;
+    gap: 0.4rem;
+    font-size: 0.95rem;
+}
+
+.footer-logo {
+    font-size: 1.1rem;
+    background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    background-clip: text;
+}
+
+.footer-brand-name {
+    font-weight: 800;
+    background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    background-clip: text;
+}
+
+.footer-tagline {
+    color: var(--color-muted);
+    font-size: 0.875rem;
+}
+
+.footer-links {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    flex-wrap: wrap;
+    justify-content: center;
+}
+
+.footer-link {
+    display: inline-flex;
+    align-items: center;
+    gap: 0.25rem;
+    color: #6366f1;
+    text-decoration: none;
+    font-size: 0.875rem;
+    font-weight: 500;
+    transition: color 0.15s;
+}
+.footer-link:hover { color: #8b5cf6; text-decoration: underline; }
+
+.footer-sep { color: var(--color-border); }
+
+.footer-note {
+    color: var(--color-muted);
+    font-size: 0.75rem;
+    max-width: 540px;
+}
+
+    </style>
+</head>
+<body>
+    <div class="container">
+
+        <header class="sticky-header">
+            <div class="header-inner">
+                <div class="brand">
+                    <span class="brand-logo" aria-hidden="true">&#x26A1;</span>
+                    <div class="brand-text">
+                        <span class="brand-name">AgentFlow</span>
+                        <span class="brand-sub">Evaluation Report</span>
+                    </div>
+                    <span class="report-title-chip">s-file</span>
+                </div>
+                <div class="header-meta">
+                    <span class="timestamp">Generated: 2026-06-13 20:47:37</span>
+                    <a class="header-link" href="https://agentflow.10xscale.ai/" target="_blank" rel="noopener" title="Documentation">
+                        &#x1F4D6; Docs
+                    </a>
+                    <a class="header-link" href="https://github.com/10xHub/Agentflow" target="_blank" rel="noopener" title="GitHub Repository">
+                        &#x1F4BE; GitHub
+                    </a>
+                    <button id="theme-toggle" class="theme-btn" aria-label="Toggle dark mode">&#x1F319;</button>
+                </div>
+            </div>
+        </header>
+
+        <section class="summary">
+            <div class="stat-card">
+                <div class="stat-icon">&#x1F4CB;</div>
+                <div class="stat-value">1</div>
+                <div class="stat-label">Total Cases</div>
+            </div>
+            <div class="stat-card stat-pass">
+                <div class="stat-icon">&#x2705;</div>
+                <div class="stat-value">1</div>
+                <div class="stat-label">Passed</div>
+            </div>
+            <div class="stat-card stat-fail">
+                <div class="stat-icon">&#x274C;</div>
+                <div class="stat-value">0</div>
+                <div class="stat-label">Failed</div>
+            </div>
+            <div class="stat-card stat-warn">
+                <div class="stat-icon">&#x26A0;&#xFE0F;</div>
+                <div class="stat-value">0</div>
+                <div class="stat-label">Errors</div>
+            </div>
+            <div class="stat-card stat-rate">
+                <div class="stat-icon">&#x1F4C8;</div>
+                <div class="stat-value">100%</div>
+                <div class="stat-label">Pass Rate</div>
+                <div class="progress-bar">
+                    <div class="progress-fill" style="width: 100%"></div>
+                </div>
+            </div>
+            <div class="stat-card">
+                <div class="stat-icon">&#x23F1;&#xFE0F;</div>
+                <div class="stat-value">0.00s</div>
+                <div class="stat-label">Duration</div>
+            </div>
+
+        </section>
+
+        <section class="charts-section">
+            <div class="chart-panel">
+                <h2>&#x1F4CA; Criterion Breakdown</h2>
+                <div id="criterion-breakdown"></div>
+            </div>
+            <div class="chart-panel">
+                <h2>&#x1F3AF; Score by Case</h2>
+                <div id="case-chart"></div>
+            </div>
+        </section>
+
+        <section class="cases-section">
+            <div class="filter-bar">
+                <button class="filter-btn active" data-filter="all">All</button>
+                <button class="filter-btn" data-filter="pass">&#x2705; Passed</button>
+                <button class="filter-btn" data-filter="fail">&#x274C; Failed</button>
+                <button class="filter-btn" data-filter="error">&#x26A0; Errors</button>
+                <input id="case-search" class="search-input" type="search"
+                       placeholder="&#x1F50D; Search cases&#x2026;" />
+            </div>
+            <div class="case-list">
+            <div class="case-item" data-status="pass" data-score="0.0000">
+                <div class="case-header">
+                    <span class="expand-icon">&#x25BA;</span>
+                    <div class="case-status pass">✓</div>
+                    <span class="case-name">c1</span>
+                    <span class="case-score">Score: 0.00</span>
+                    <span class="case-duration">0.00s</span>
+                </div>
+                <div class="case-details">
+                </div>
+            </div>
+            </div>
+        </section>
+
+        <footer>
+            <div class="footer-inner">
+                <div class="footer-brand">
+                    <span class="footer-logo" aria-hidden="true">&#x26A1;</span>
+                    <span class="footer-brand-name">AgentFlow</span>
+                    <span class="footer-tagline">— Multi-Agent AI Framework by 10xScale</span>
+                </div>
+                <div class="footer-links">
+                    <a class="footer-link" href="https://agentflow.10xscale.ai/" target="_blank" rel="noopener">
+                        &#x1F4D6; Documentation
+                    </a>
+                    <span class="footer-sep" aria-hidden="true">&#x2022;</span>
+                    <a class="footer-link" href="https://github.com/10xHub/Agentflow" target="_blank" rel="noopener">
+                        &#x1F4BE; GitHub
+                    </a>
+                    <span class="footer-sep" aria-hidden="true">&#x2022;</span>
+                    <a class="footer-link" href="https://pypi.org/project/10xscale-agentflow/" target="_blank" rel="noopener">
+                        &#x1F4E6; PyPI
+                    </a>
+                </div>
+                <p class="footer-note">Generated by the AgentFlow Evaluation Framework &nbsp;&middot;&nbsp; Report is self-contained, no internet connection required to view.</p>
+            </div>
+        </footer>
+
+    </div>
+    <script>
+
+(function () {
+    'use strict';
+
+    /* ── Dark mode ─────────────────────────────────────────────────────── */
+    const root = document.documentElement;
+    const themeBtn = document.getElementById('theme-toggle');
+    const mq = window.matchMedia('(prefers-color-scheme: dark)');
+
+    function applyTheme(dark) {
+        root.setAttribute('data-theme', dark ? 'dark' : 'light');
+        if (themeBtn) { themeBtn.textContent = dark ? '\u2600\ufe0f' : '\ud83c\udf19'; }
+    }
+    applyTheme(mq.matches);
+    if (themeBtn) {
+        themeBtn.addEventListener('click', function () {
+            applyTheme(root.getAttribute('data-theme') !== 'dark');
+        });
+    }
+
+    /* ── Case expand / collapse ────────────────────────────────────────── */
+    document.querySelectorAll('.case-header').forEach(function (header) {
+        header.addEventListener('click', function () {
+            header.closest('.case-item').classList.toggle('expanded');
+        });
+    });
+
+    /* ── Filter buttons ────────────────────────────────────────────────── */
+    var activeFilter = 'all';
+    document.querySelectorAll('.filter-btn').forEach(function (btn) {
+        btn.addEventListener('click', function () {
+            document.querySelectorAll('.filter-btn').forEach(function (b) {
+                b.classList.remove('active');
+            });
+            btn.classList.add('active');
+            activeFilter = btn.dataset.filter;
+            applyFilters();
+        });
+    });
+
+    /* ── Search ────────────────────────────────────────────────────────── */
+    var searchQuery = '';
+    var searchInput = document.getElementById('case-search');
+    if (searchInput) {
+        searchInput.addEventListener('input', function () {
+            searchQuery = searchInput.value.toLowerCase();
+            applyFilters();
+        });
+    }
+
+    function applyFilters() {
+        document.querySelectorAll('.case-item').forEach(function (item) {
+            var statusOk = activeFilter === 'all' || item.dataset.status === activeFilter;
+            var nameEl = item.querySelector('.case-name');
+            var name = nameEl ? nameEl.textContent.toLowerCase() : '';
+            var searchOk = !searchQuery || name.indexOf(searchQuery) !== -1;
+            item.style.display = (statusOk && searchOk) ? '' : 'none';
+        });
+        renderCaseChart();
+    }
+
+    /* ── SVG helpers ───────────────────────────────────────────────────── */
+    var SVG_NS = 'http://www.w3.org/2000/svg';
+
+    function makeSvg(w, h) {
+        var svg = document.createElementNS(SVG_NS, 'svg');
+        svg.setAttribute('width', w);
+        svg.setAttribute('height', h);
+        svg.setAttribute('class', 'chart-svg');
+        svg.setAttribute('aria-hidden', 'true');
+        return svg;
+    }
+
+    function makeRect(x, y, w, h, fill) {
+        var r = document.createElementNS(SVG_NS, 'rect');
+        r.setAttribute('x', x);
+        r.setAttribute('y', y);
+        r.setAttribute('width', Math.max(0, w));
+        r.setAttribute('height', h);
+        r.setAttribute('fill', fill);
+        r.setAttribute('rx', 3);
+        return r;
+    }
+
+    function makeText(x, y, txt, anchor, cls) {
+        var t = document.createElementNS(SVG_NS, 'text');
+        t.setAttribute('x', x);
+        t.setAttribute('y', y);
+        t.setAttribute('text-anchor', anchor);
+        if (cls) { t.setAttribute('class', cls); }
+        t.textContent = txt;
+        return t;
+    }
+
+    function scoreColor(score) {
+        var hue = Math.round(score * 120);
+        return 'hsl(' + hue + ',65%,45%)';
+    }
+
+    function truncate(str, maxLen) {
+        return str.length > maxLen ? str.slice(0, maxLen - 1) + '\u2026' : str;
+    }
+
+    /* ── Criterion breakdown chart ──────────────────────────────────────── */
+    function renderCriterionChart() {
+        var container = document.getElementById('criterion-breakdown');
+        if (!container) { return; }
+        container.innerHTML = '';
+
+        var critMap = {};
+        document.querySelectorAll('.criterion-row').forEach(function (row) {
+            var name = row.dataset.criterion;
+            var score = parseFloat(row.dataset.score);
+            if (!name || isNaN(score)) { return; }
+            if (!critMap[name]) { critMap[name] = { total: 0, count: 0 }; }
+            critMap[name].total += score;
+            critMap[name].count += 1;
+        });
+
+        var entries = Object.keys(critMap).map(function (name) {
+            var d = critMap[name];
+            return { name: name, avg: d.total / d.count };
+        }).sort(function (a, b) { return b.avg - a.avg; });
+
+        if (!entries.length) {
+            container.textContent = 'No criteria data.';
+            return;
+        }
+
+        var BAR_H = 22, GAP = 6;
+        var PAD = { top: 8, left: 140, right: 48, bottom: 8 };
+        var W = container.clientWidth || 260;
+        var availW = W - PAD.left - PAD.right;
+        var H = entries.length * (BAR_H + GAP) + PAD.top + PAD.bottom;
+        var svg = makeSvg(W, H);
+
+        entries.forEach(function (entry, i) {
+            var y = PAD.top + i * (BAR_H + GAP);
+            var barW = entry.avg * availW;
+            /* track */
+            var bg = makeRect(PAD.left, y, availW, BAR_H, 'var(--color-border)');
+            bg.style.opacity = '0.3';
+            svg.appendChild(bg);
+            /* fill */
+            svg.appendChild(makeRect(PAD.left, y, barW, BAR_H, scoreColor(entry.avg)));
+            /* labels */
+            svg.appendChild(makeText(PAD.left - 6, y + BAR_H * 0.68, truncate(entry.name, 20), 'end', 'chart-label'));
+            svg.appendChild(makeText(PAD.left + barW + 5, y + BAR_H * 0.68, entry.avg.toFixed(2), 'start', 'chart-val'));
+        });
+
+        container.appendChild(svg);
+    }
+
+    /* ── Score by case chart ────────────────────────────────────────────── */
+    function renderCaseChart() {
+        var container = document.getElementById('case-chart');
+        if (!container) { return; }
+        container.innerHTML = '';
+
+        var items = Array.prototype.slice.call(document.querySelectorAll('.case-item')).filter(function (el) {
+            return el.style.display !== 'none';
+        });
+
+        if (!items.length) {
+            container.textContent = 'No cases to display.';
+            return;
+        }
+
+        var BAR_H = 18, GAP = 6;
+        var PAD = { top: 8, left: 140, right: 48, bottom: 8 };
+        var W = container.clientWidth || 260;
+        var availW = W - PAD.left - PAD.right;
+        var H = items.length * (BAR_H + GAP) + PAD.top + PAD.bottom;
+        var svg = makeSvg(W, H);
+
+        items.forEach(function (item, i) {
+            var nameEl = item.querySelector('.case-name');
+            var rawName = nameEl ? nameEl.textContent : '';
+            var name = truncate(rawName, 22);
+            var score = parseFloat(item.dataset.score || '0');
+            var status = item.dataset.status;
+            var y = PAD.top + i * (BAR_H + GAP);
+            var barW = score * availW;
+            var fill = status === 'pass'  ? 'var(--color-pass)'
+                     : status === 'error' ? 'var(--color-warn)'
+                     : 'var(--color-fail)';
+            /* track */
+            var bg = makeRect(PAD.left, y, availW, BAR_H, 'var(--color-border)');
+            bg.style.opacity = '0.2';
+            svg.appendChild(bg);
+            /* fill */
+            svg.appendChild(makeRect(PAD.left, y, barW, BAR_H, fill));
+            /* labels */
+            svg.appendChild(makeText(PAD.left - 6, y + BAR_H * 0.75, name, 'end', 'chart-label'));
+            svg.appendChild(makeText(PAD.left + barW + 5, y + BAR_H * 0.75, score.toFixed(2), 'start', 'chart-val'));
+        });
+
+        container.appendChild(svg);
+    }
+
+    /* ── Initial render ─────────────────────────────────────────────────── */
+    renderCriterionChart();
+    renderCaseChart();
+
+    /* Re-render on resize */
+    var _rTimer;
+    window.addEventListener('resize', function () {
+        clearTimeout(_rTimer);
+        _rTimer = setTimeout(function () {
+            renderCriterionChart();
+            renderCaseChart();
+        }, 150);
+    });
+
+}());
+
+    </script>
+</body>
+</html>
\ No newline at end of file
diff --git a/eval_reports/s-file_20260613_204737.json b/eval_reports/s-file_20260613_204737.json
new file mode 100644
index 00000000..5ea4f953
--- /dev/null
+++ b/eval_reports/s-file_20260613_204737.json
@@ -0,0 +1,123 @@
+{
+  "eval_set_id": "s-file",
+  "eval_set_name": "",
+  "results": [
+    {
+      "eval_id": "c1",
+      "name": "",
+      "passed": true,
+      "criterion_results": [],
+      "actual_trajectory": [],
+      "actual_tool_calls": [],
+      "actual_response": "",
+      "messages": [],
+      "node_responses": [],
+      "node_visits": [],
+      "duration_seconds": 0.0,
+      "error": null,
+      "metadata": {},
+      "turn_results": [],
+      "token_usage": {
+        "input_tokens": 0,
+        "output_tokens": 0,
+        "cache_read_tokens": 0,
+        "cache_creation_tokens": 0,
+        "total_tokens": 0
+      },
+      "agent_token_usage": {
+        "input_tokens": 0,
+        "output_tokens": 0,
+        "cache_read_tokens": 0,
+        "cache_creation_tokens": 0,
+        "total_tokens": 0
+      },
+      "node_details": []
+    }
+  ],
+  "summary": {
+    "total_cases": 1,
+    "passed_cases": 1,
+    "failed_cases": 0,
+    "error_cases": 0,
+    "pass_rate": 1.0,
+    "avg_duration_seconds": 0.0,
+    "total_duration_seconds": 0.0,
+    "criterion_stats": {},
+    "total_token_usage": {
+      "input_tokens": 0,
+      "output_tokens": 0,
+      "cache_read_tokens": 0,
+      "cache_creation_tokens": 0,
+      "total_tokens": 0
+    },
+    "per_case_token_usage": {
+      "c1": {
+        "input_tokens": 0,
+        "output_tokens": 0,
+        "cache_read_tokens": 0,
+        "cache_creation_tokens": 0,
+        "total_tokens": 0
+      }
+    },
+    "avg_tokens_per_case": 0.0
+  },
+  "config_used": {
+    "criteria": {
+      "tool_name_match": null,
+      "trajectory": {
+        "threshold": 1.0,
+        "match_type": "EXACT",
+        "judge_model": "gemini-2.5-flash",
+        "num_samples": 3,
+        "rubrics": [],
+        "keywords": [],
+        "check_args": false,
+        "enabled": true,
+        "api_style": "responses"
+      },
+      "node_order": null,
+      "response_match": {
+        "threshold": 0.8,
+        "match_type": "EXACT",
+        "judge_model": "gemini-2.5-flash",
+        "num_samples": 3,
+        "rubrics": [],
+        "keywords": [],
+        "check_args": false,
+        "enabled": true,
+        "api_style": "responses"
+      },
+      "rouge_match": null,
+      "contains_keywords": null,
+      "llm_judge": null,
+      "rubric_based": null,
+      "factual_accuracy": null,
+      "hallucination": null,
+      "safety": null,
+      "simulation_goals": null
+    },
+    "user_simulator_config": null,
+    "parallel": false,
+    "max_concurrency": 4,
+    "timeout": 300.0,
+    "verbose": false,
+    "mock_mode": false,
+    "reporter": {
+      "enabled": true,
+      "output_dir": "eval_reports",
+      "console": true,
+      "json_report": true,
+      "html": true,
+      "junit_xml": false,
+      "verbose": true,
+      "include_details": true,
+      "include_trajectory": true,
+      "include_node_responses": true,
+      "include_actual_response": true,
+      "include_tool_call_details": true,
+      "timestamp_files": true
+    }
+  },
+  "timestamp": 1781362057.5080197,
+  "metadata": {}
+}
\ No newline at end of file
diff --git a/examples/evaluation/test1/test_weather_agent.py b/examples/evaluation/test1/test_weather_agent.py
index c458c390..d08d9c99 100644
--- a/examples/evaluation/test1/test_weather_agent.py
+++ b/examples/evaluation/test1/test_weather_agent.py
@@ -12,9 +12,8 @@
 
 # import pytest
 
-# from agentflow.evaluation.config.eval_config import CriterionConfig, EvalConfig, MatchType
-# from agentflow.evaluation.evaluator import AgentEvaluator
-# from agentflow.state.message import Message
+# from agentflow.qa.evaluation import AgentEvaluator, CriterionConfig, EvalConfig, MatchType
+# from agentflow.core.state import Message
 
 # from .samples import CAPITAL_QUESTION, LONDON, NYC
 
diff --git a/examples/evaluation/test_graph/__init__.py b/examples/evaluation/test_graph/__init__.py
index 4a554b69..2f35cec0 100644
--- a/examples/evaluation/test_graph/__init__.py
+++ b/examples/evaluation/test_graph/__init__.py
@@ -57,7 +57,7 @@ def get_forecast(location: str, days: int = 3) -> str:
                 ),
             },
         ],
-        tool_node_name="TOOL",
+        tool_node="TOOL",
     )
 
     # ── Routing ──────────────────────────────────────────────────────
diff --git a/examples/github-mcp/git_mcp.py b/examples/github-mcp/git_mcp.py
index ce6617e3..f233e5ac 100644
--- a/examples/github-mcp/git_mcp.py
+++ b/examples/github-mcp/git_mcp.py
@@ -28,7 +28,7 @@
 
 client_http = Client(config)
 
-tool_node = ToolNode(functions=[], client=client_http)
+tool_node = ToolNode([], client=client_http)
 
 
 main_agent = Agent(
diff --git a/examples/tool-decorator/README.md b/examples/tool-decorator/README.md
index 0a1d8e75..1731489e 100644
--- a/examples/tool-decorator/README.md
+++ b/examples/tool-decorator/README.md
@@ -77,7 +77,7 @@ def advanced_function(x: int) -> int:
 ### Tag Filtering
 
 ```python
-from agentflow.graph.tool_node import ToolNode
+from agentflow.core.graph import ToolNode
 
 # Create tools with different tags
 @tool(name="read_tool", tags=["database", "read"])
diff --git a/tests/test_docs_imports.py b/tests/test_docs_imports.py
new file mode 100644
index 00000000..49d62401
--- /dev/null
+++ b/tests/test_docs_imports.py
@@ -0,0 +1,115 @@
+"""Doc/example import guardrail.
+
+Prevents regressions of the pre-refactor import drift: README and ``examples/`` must not
+reference module paths that were removed in the ``core/`` / ``storage/`` / ``runtime/`` / ``qa/``
+restructure, and must not use APIs that do not exist (``Message.from_text``,
+``ToolNode(functions=...)``, ``Agent(tool_node_name=...)``).
+
+Two layers:
+  1. Static scan of every ``agentflow.*`` import in README + examples against a denylist of
+     removed top-level shims, plus a scan for known-bad API call patterns.
+  2. A live check that the canonical symbols the README now advertises are importable and real.
+
+The static scan needs no optional dependencies and is the authoritative regression guard.
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+
+# Top-level module prefixes removed in the package restructure. Any import that starts with one
+# of these (followed by "." or end of token) is a dead path. Note that the canonical paths
+# (agentflow.core.state, agentflow.core.graph, ...) do NOT start with any of these.
+DEAD_PREFIXES = (
+    "agentflow.graph",
+    "agentflow.state",
+    "agentflow.checkpointer",
+    "agentflow.evaluation",
+    "agentflow.skills",
+    "agentflow.testing",
+    "agentflow.adapters",
+    "agentflow.publisher",
+)
+
+# API call patterns that reference symbols/keywords that do not exist.
+BAD_API_PATTERNS = {
+    r"\bMessage\.from_text\s*\(": "Message.from_text does not exist; use Message.text_message",
+    r"\bToolNode\s*\(\s*functions\s*=": "ToolNode takes `tools` (positional), not `functions=`",
+    r"\btool_node_name\s*=": "Agent uses `tool_node=`, not `tool_node_name=`",
+}
+
+_IMPORT_RE = re.compile(r"^\s*(?:from\s+(agentflow[\w.]*)\s+import|import\s+(agentflow[\w.]*))")
+_PY_FENCE_RE = re.compile(r"```(?:python|py)\s*\n(.*?)```", re.DOTALL)
+
+
+def _doc_files() -> list[Path]:  # README.md plus every .md/.py found under examples/
+    files = [REPO_ROOT / "README.md"]
+    examples = REPO_ROOT / "examples"
+    if examples.is_dir():
+        files += sorted(examples.rglob("*.md"))  # sorted => stable violation order
+        files += sorted(examples.rglob("*.py"))
+    return [f for f in files if f.is_file()]  # drops a missing README and non-file matches
+
+
+def _code_text(path: Path) -> str:
+    """Return the Python source contained in a file (fenced blocks for .md, whole file for .py)."""
+    text = path.read_text(encoding="utf-8", errors="replace")  # bad bytes -> U+FFFD, no raise
+    if path.suffix == ".md":
+        return "\n".join(_PY_FENCE_RE.findall(text))  # bodies of ```python / ```py fences only
+    return text
+
+
+def _agentflow_imports(code: str) -> list[str]:
+    """Return the imported ``agentflow.*`` module path for each non-commented import line."""
+    mods = []
+    for line in code.splitlines():
+        if line.lstrip().startswith("#"):  # commented-out imports are not violations
+            continue
+        m = _IMPORT_RE.match(line)
+        if m:
+            mods.append(m.group(1) or m.group(2))  # group 1: "from X import", group 2: "import X"
+    return mods
+
+
+def _is_dead(mod: str) -> bool:  # True for a removed package itself or any dotted child of it
+    return any(mod == p or mod.startswith(p + ".") for p in DEAD_PREFIXES)
+
+
+def test_no_dead_import_paths_in_docs():
+    """No README/example code references a removed top-level module path."""
+    violations = []  # "relative/path: dead.module" entries, one per offending import line
+    for f in _doc_files():
+        for mod in _agentflow_imports(_code_text(f)):
+            if _is_dead(mod):
+                violations.append(f"{f.relative_to(REPO_ROOT)}: {mod}")
+    assert not violations, (
+        "Dead import paths found (use agentflow.core.* / agentflow.storage.* / agentflow.qa.*):\n"
+        + "\n".join(violations)
+    )
+
+
+def test_no_nonexistent_api_patterns_in_docs():
+    """No README/example code uses an API symbol/keyword that does not exist."""
+    violations = []
+    for f in _doc_files():
+        code = _code_text(f)  # for .py files this is the whole file, comments included
+        for pattern, why in BAD_API_PATTERNS.items():
+            if re.search(pattern, code):  # first hit is enough: one entry per (file, pattern)
+                violations.append(f"{f.relative_to(REPO_ROOT)}: {why}")
+    assert not violations, "Nonexistent API usage found:\n" + "\n".join(violations)
+
+
+def test_canonical_readme_symbols_are_real():
+    """The canonical symbols the README advertises import and exist."""
+    from agentflow.core.graph import Agent, StateGraph, ToolNode  # noqa: F401
+    from agentflow.core.state import AgentState, Message  # noqa: F401
+    from agentflow.storage.checkpointer import InMemoryCheckpointer  # noqa: F401
+    from agentflow.utils import ResponseGranularity, convert_messages  # noqa: F401
+    from agentflow.utils.constants import END  # noqa: F401
+
+    assert hasattr(Message, "text_message")  # the constructor BAD_API_PATTERNS points users to
+    assert not hasattr(Message, "from_text")  # keeps the static denylist entry honest
diff --git a/tests/test_import_order.py b/tests/test_import_order.py
new file mode 100644
index 00000000..7b9351d4
--- /dev/null
+++ b/tests/test_import_order.py
@@ -0,0 +1,66 @@
+"""Import-order regression guard.
+
+``agentflow.core.graph`` imports back into ``agentflow.utils`` and
+``agentflow.storage.checkpointer``. Historically that made those modules unimportable as the
+*first* import in a fresh interpreter (``ImportError: ... partially initialized module``), because
+``agentflow.core`` eagerly pulled in ``graph``. ``graph`` is now loaded lazily (PEP 562
+``__getattr__`` in ``agentflow/core/__init__.py``) so every public entry point imports cleanly in
+any order.
+
+Each case runs in a *fresh* subprocess — importing in-process would not catch the bug once pytest
+has already loaded ``agentflow.core``.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+
+import pytest
+
+
+# Public entry points that must import cleanly as the very first import in a fresh interpreter.
+FIRST_IMPORTS = [
+    "import agentflow.utils",
+    "from agentflow.utils import CallbackManager, convert_messages, tool",
+    "import agentflow.storage",
+    "import agentflow.storage.checkpointer",
+    "from agentflow.storage.checkpointer import InMemoryCheckpointer, BaseCheckpointer",
+    "import agentflow.core",
+    "from agentflow.core import StateGraph, Agent, ToolNode, CompiledGraph, AgentState, Message",
+    "from agentflow.core.graph import Agent, StateGraph, ToolNode, CompiledGraph",
+    "from agentflow.core.state import AgentState, Message",
+    "import agentflow.runtime.publisher",
+    "import agentflow.qa.evaluation",
+    "import agentflow.qa.testing",
+]
+
+
+@pytest.mark.parametrize("statement", FIRST_IMPORTS)
+def test_importable_as_first_import(statement: str):
+    """The statement succeeds when it is the only thing a fresh interpreter imports."""
+    result = subprocess.run(
+        [sys.executable, "-c", statement],  # same interpreter as the test run, new process
+        capture_output=True,
+        text=True,
+        timeout=60,  # seconds; a hung import raises TimeoutExpired instead of stalling the run
+    )
+    assert result.returncode == 0, (
+        f"`{statement}` failed as a first import:\n{result.stderr}"
+    )
+
+
+def test_lazy_graph_symbol_identity():
+    """The lazily-resolved aggregate symbol is the same object as the direct submodule symbol."""
+    code = (
+        "from agentflow.core import StateGraph as A\n"  # aggregate import runs first
+        "from agentflow.core.graph import StateGraph as B\n"
+        "assert A is B, 'aggregate symbol is not the submodule symbol'\n"
+    )
+    result = subprocess.run(
+        [sys.executable, "-c", code],  # fresh process, so nothing is pre-imported by pytest
+        capture_output=True,
+        text=True,
+        timeout=60,
+    )
+    assert result.returncode == 0, result.stderr  # stderr carries the child's traceback

From dd5a7d80dc60fedc7b00bb2b47454f3ed85ffb80 Mon Sep 17 00:00:00 2001
From: Shudipto Trafder <shudiptotrafder@gmail.com>
Date: Sat, 13 Jun 2026 21:10:26 +0600
Subject: [PATCH 2/2] feat: enhance provider detection and model resolution in
 Agent class

---
 .gitignore                                    |  2 +
 agentflow/core/graph/agent.py                 | 17 ++--
 .../core/graph/agent_internal/providers.py    | 18 +++-
 agentflow/core/llm/client_factory.py          | 45 +++++++++-
 tests/graph/test_agent_internal.py            | 85 +++++++++++++++++++
 5 files changed, 152 insertions(+), 15 deletions(-)

diff --git a/.gitignore b/.gitignore
index b2be357e..3ca3c960 100644
--- a/.gitignore
+++ b/.gitignore
@@ -84,3 +84,5 @@ site
 
 graphify-out/
 service_account.json
+
+eval_reports/
\ No newline at end of file
diff --git a/agentflow/core/graph/agent.py b/agentflow/core/graph/agent.py
index 8a6ab099..f175f902 100644
--- a/agentflow/core/graph/agent.py
+++ b/agentflow/core/graph/agent.py
@@ -264,11 +264,6 @@ class MyState(AgentState):
             **kwargs,
         )
 
-        # check user sending model and provider as prefix, if provider is not explicitly provided
-        if "/" in model and provider is None:
-            provider, model = model.split("/", 1)
-            self.model = model
-
         # Store output type
         self.output_type = output_type.lower()
         self.output_schema = output_schema
@@ -276,15 +271,17 @@ class MyState(AgentState):
 
         # Determine provider; self.llm_kwargs is set by super().__init__ and is
         # already available here for _create_client().
+        self.base_url = base_url
         if provider is not None:
+            # Provider explicitly supplied — trust it as-is.
             self.provider = provider.lower()
-            self.base_url = base_url
             self.client = self._create_client(self.provider, base_url, use_vertex_ai)
         else:
-            # Auto-detect provider from model name
-            self.provider = self._detect_provider_from_model(model, use_vertex_ai)
-            self.base_url = base_url
-            self.client = self._create_client(self.provider, base_url)
+            # Resolve provider (and strip a recognised ``provider/`` prefix) from
+            # the model string. Unknown prefixes resolve to ``openai`` and keep
+            # the full model name (e.g. OpenAI-compatible/self-hosted models).
+            self.provider, self.model = self._resolve_provider_and_model(model, use_vertex_ai)
+            self.client = self._create_client(self.provider, base_url, use_vertex_ai)
 
         # Validate that provider supports the output type
         self._validate_output_type()
diff --git a/agentflow/core/graph/agent_internal/providers.py b/agentflow/core/graph/agent_internal/providers.py
index b51cc49e..4f2627f2 100644
--- a/agentflow/core/graph/agent_internal/providers.py
+++ b/agentflow/core/graph/agent_internal/providers.py
@@ -5,7 +5,11 @@
 import logging
 from typing import Any, Protocol
 
-from agentflow.core.llm.client_factory import create_llm_client, detect_provider
+from agentflow.core.llm.client_factory import (
+    create_llm_client,
+    detect_provider,
+    resolve_provider_and_model,
+)
 
 from .constants import (
     CLIENT_CONSTRUCTOR_KWARGS,
@@ -53,6 +57,18 @@ def _detect_provider_from_model(self, model: str, use_vertex_ai: bool = False) -
         """Infer the provider from the model name when not explicitly supplied."""
         return detect_provider(model, use_vertex_ai=use_vertex_ai)
 
+    def _resolve_provider_and_model(
+        self, model: str, use_vertex_ai: bool = False
+    ) -> tuple[str, str]:
+        """Resolve a model string into a ``(provider, model)`` pair.
+
+        Recognised ``provider/`` prefixes (``gemini``, ``google``, ``openai``,
+        ``gpt``) select the provider and are stripped from the model name.
+        Unknown prefixes are kept intact and default to the ``openai`` provider
+        so OpenAI-compatible / self-hosted models work out of the box.
+        """
+        return resolve_provider_and_model(model, use_vertex_ai=use_vertex_ai)  # thin delegate
+
     def _create_google_vertex_ai_client(self) -> Any:
         return create_llm_client("google", use_vertex_ai=True)
 
diff --git a/agentflow/core/llm/client_factory.py b/agentflow/core/llm/client_factory.py
index 6ec2f5af..35cdfb35 100644
--- a/agentflow/core/llm/client_factory.py
+++ b/agentflow/core/llm/client_factory.py
@@ -14,6 +14,16 @@
 
 logger = logging.getLogger("agentflow.llm")
 
+# Recognised ``provider/`` prefixes mapped to the concrete provider the client
+# factory can build. A prefix not listed here is unknown: detection falls back to
+# the model name and defaults to ``"openai"`` (its SDK serves compatible endpoints).
+_PROVIDER_PREFIXES = {
+    "gemini": "google",
+    "google": "google",
+    "openai": "openai",
+    "gpt": "openai",
+}
+
 # Keys allowed in the AsyncOpenAI constructor but NOT in per-request calls.
 _CLIENT_CONSTRUCTOR_KWARGS = frozenset(
     {
@@ -44,10 +54,8 @@ def detect_provider(model: str, use_vertex_ai: bool = False) -> str:
 
     if "/" in model:
         prefix = model.split("/", 1)[0].lower()
-        if prefix in ("gemini", "google"):
-            return "google"
-        if prefix in ("openai", "gpt"):
-            return "openai"
+        if prefix in _PROVIDER_PREFIXES:
+            return _PROVIDER_PREFIXES[prefix]
         # Unknown prefix — fall through to name-based detection using the suffix
         model = model.split("/", 1)[1]
 
@@ -64,6 +72,35 @@ def detect_provider(model: str, use_vertex_ai: bool = False) -> str:
     return "openai"
 
 
+def resolve_provider_and_model(
+    model: str, use_vertex_ai: bool = False
+) -> tuple[str, str]:
+    """Resolve a model string into a concrete ``(provider, model)`` pair.
+
+    Unlike :func:`detect_provider`, this also returns the model name that should
+    be sent to the provider. A *recognised* ``provider/`` prefix (e.g.
+    ``"gemini/..."``, ``"openai/..."``) is stripped, since the provider is
+    selected from the prefix. An *unrecognised* prefix is kept intact: it may be
+    an OpenAI-compatible / HuggingFace-style identifier (e.g.
+    ``"meta-llama/Llama-3-70b"``) where the slash is part of the real model name.
+    Such models default to ``"openai"`` unless :func:`detect_provider` decides otherwise.
+
+    Args:
+        model: Model identifier, optionally prefixed with ``"provider/"``.
+        use_vertex_ai: When True, always selects the ``"google"`` provider.
+
+    Returns:
+        A ``(provider, model)`` tuple where provider is ``"google"`` or
+        ``"openai"``.
+    """
+    if "/" in model:
+        prefix, rest = model.split("/", 1)
+        if prefix.lower() in _PROVIDER_PREFIXES:
+            return detect_provider(model, use_vertex_ai=use_vertex_ai), rest
+    # No recognised prefix: the model string is passed through untouched.
+    return detect_provider(model, use_vertex_ai=use_vertex_ai), model
+
+
 def create_llm_client(
     provider: str,
     *,
diff --git a/tests/graph/test_agent_internal.py b/tests/graph/test_agent_internal.py
index ca4e4e82..2bfdfa52 100644
--- a/tests/graph/test_agent_internal.py
+++ b/tests/graph/test_agent_internal.py
@@ -281,6 +281,56 @@ def test_deepseek_defaults_to_openai(self):
         agent = _make_openai_agent()
         assert agent._detect_provider_from_model("deepseek-chat") == "openai"
 
+    def test_unknown_prefix_falls_back_to_openai(self):
+        agent = _make_openai_agent()
+        assert agent._detect_provider_from_model("ollama/llama3") == "openai"  # unlisted prefix
+        assert agent._detect_provider_from_model("anthropic/claude-3") == "openai"
+
+
+class TestResolveProviderAndModel:
+    """``_resolve_provider_and_model`` returns ``(provider, model)``: it strips
+    recognised provider aliases and defaults unknown prefixes to openai."""
+
+    def test_gemini_alias_maps_to_google(self):
+        agent = _make_openai_agent()
+        assert agent._resolve_provider_and_model("gemini/gemini-2.5-flash") == (
+            "google",
+            "gemini-2.5-flash",  # alias prefix removed
+        )
+
+    def test_google_alias_maps_to_google(self):
+        agent = _make_openai_agent()
+        assert agent._resolve_provider_and_model("google/gemini-2.0-flash") == (
+            "google",
+            "gemini-2.0-flash",
+        )
+
+    def test_openai_alias_maps_to_openai(self):
+        agent = _make_openai_agent()
+        assert agent._resolve_provider_and_model("openai/gpt-4o") == ("openai", "gpt-4o")
+
+    def test_gpt_alias_maps_to_openai(self):
+        agent = _make_openai_agent()
+        assert agent._resolve_provider_and_model("gpt/gpt-4o") == ("openai", "gpt-4o")
+
+    def test_unknown_prefix_defaults_to_openai_and_keeps_full_model(self):
+        agent = _make_openai_agent()
+        assert agent._resolve_provider_and_model("meta-llama/Llama-3-70b") == (
+            "openai",
+            "meta-llama/Llama-3-70b",  # slash is part of the model id, nothing stripped
+        )
+
+    def test_bare_unknown_model_defaults_to_openai(self):
+        agent = _make_openai_agent()
+        assert agent._resolve_provider_and_model("llama3:70b") == ("openai", "llama3:70b")
+
+    def test_use_vertex_ai_forces_google(self):
+        agent = _make_openai_agent()
+        assert agent._resolve_provider_and_model("llama3:70b", use_vertex_ai=True) == (
+            "google",  # chosen by the flag, not by the model name
+            "llama3:70b",
+        )
+
 
 class TestValidateOutputType:
     def test_valid_text_type_does_not_raise(self):
@@ -1955,6 +2005,41 @@ def test_unknown_model_without_provider_auto_detects_openai(self):
             agent = Agent(model="llama3:70b", reasoning_config=None)
         assert agent.provider == "openai"
 
+    def test_gemini_slash_prefix_maps_to_google_provider(self):
+        """The ``gemini/`` alias resolves to ``google`` and is stripped from ``agent.model``."""
+        with patch.object(Agent, "_create_client", return_value=MagicMock()):
+            agent = Agent(model="gemini/gemini-2.5-flash", reasoning_config=None)
+        assert agent.provider == "google"
+        assert agent.model == "gemini-2.5-flash"
+
+    def test_gpt_slash_prefix_maps_to_openai_provider(self):
+        """The ``gpt/`` alias resolves to ``openai`` and is stripped from ``agent.model``."""
+        with patch.object(Agent, "_create_client", return_value=MagicMock()):
+            agent = Agent(model="gpt/gpt-4o", reasoning_config=None)
+        assert agent.provider == "openai"
+        assert agent.model == "gpt-4o"
+
+    def test_unknown_prefix_resolves_to_openai_and_keeps_full_model(self):
+        """An unrecognised prefix must default to openai, not google, and keep
+        the full model string (it may be an OpenAI-compatible / HF-style name)."""
+        with patch.object(Agent, "_create_client", return_value=MagicMock()):  # client stubbed
+            agent = Agent(model="meta-llama/Llama-3-70b", reasoning_config=None)
+        assert agent.provider == "openai"
+        assert agent.model == "meta-llama/Llama-3-70b"
+
+    def test_anthropic_prefix_resolves_to_openai(self):
+        """Claude via an OpenAI-compatible endpoint should not select google."""
+        with patch.object(Agent, "_create_client", return_value=MagicMock()):
+            agent = Agent(model="anthropic/claude-3", reasoning_config=None)
+        assert agent.provider == "openai"
+        assert agent.model == "anthropic/claude-3"  # unrecognised prefix is not stripped
+
+    def test_ollama_prefix_resolves_to_openai(self):
+        with patch.object(Agent, "_create_client", return_value=MagicMock()):
+            agent = Agent(model="ollama/llama3", reasoning_config=None)
+        assert agent.provider == "openai"  # unlisted prefix falls back to the default
+        assert agent.model == "ollama/llama3"  # and the model string is left whole
+
     # ── reasoning config normalization ────────────────────────────────────
 
     def test_default_sentinel_produces_medium_effort(self):