From 58ebd5499d08355e66b85bae4f995ef2d83fadbd Mon Sep 17 00:00:00 2001
From: Luca Belli <129434630+sator-labs@users.noreply.github.com>
Date: Mon, 15 Jun 2026 09:38:03 -0700
Subject: [PATCH 1/4] docs: split AGENTS.md from CLAUDE.md and drop sync hook

---
 .pre-commit-config.yaml                      |   8 -
 .pre-commit-scripts/sync-claude-to-agents.sh |  71 -----
 AGENTS.md                                    | 269 +++++++++----------
 CLAUDE.md                                    | 197 +++++---------
 docs/pre-commit-hooks.md                     |  38 +--
 5 files changed, 198 insertions(+), 385 deletions(-)
 delete mode 100755 .pre-commit-scripts/sync-claude-to-agents.sh

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 653988d1..6ba90604 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,11 +6,3 @@ repos:
         args: [--fix]
       - id: ruff-format
 
-  - repo: local
-    hooks:
-      - id: sync-claude-to-agents
-        name: Sync CLAUDE.md to AGENTS.md
-        entry: .pre-commit-scripts/sync-claude-to-agents.sh
-        language: script
-        files: ^CLAUDE\.md$
-        pass_filenames: false
diff --git a/.pre-commit-scripts/sync-claude-to-agents.sh b/.pre-commit-scripts/sync-claude-to-agents.sh
deleted file mode 100755
index 043cd014..00000000
--- a/.pre-commit-scripts/sync-claude-to-agents.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/bin/bash
-# Pre-commit hook to sync CLAUDE.md to AGENTS.md
-# Fails if AGENTS.md exists and differs from CLAUDE.md
-
-set -e
-
-CLAUDE_FILE="CLAUDE.md"
-AGENTS_FILE="AGENTS.md"
-
-# Check if CLAUDE.md exists
-if [ ! -f "$CLAUDE_FILE" ]; then
-    echo "Error: $CLAUDE_FILE not found"
-    exit 1
-fi
-
-# If AGENTS.md doesn't exist, create it
-if [ ! -f "$AGENTS_FILE" ]; then
-    echo "Creating $AGENTS_FILE from $CLAUDE_FILE"
-    cp "$CLAUDE_FILE" "$AGENTS_FILE"
-    git add "$AGENTS_FILE"
-    exit 0
-fi
-
-# If AGENTS.md exists, check if it differs from CLAUDE.md
-if ! diff -q "$CLAUDE_FILE" "$AGENTS_FILE" > /dev/null 2>&1; then
-    echo ""
-    echo "════════════════════════════════════════════════════════════════════════════"
-    echo "❌ ERROR: AGENTS.md must be identical to CLAUDE.md"
-    echo "════════════════════════════════════════════════════════════════════════════"
-    echo ""
-    echo "PROBLEM:"
-    echo "  You modified CLAUDE.md, but AGENTS.md already exists with different content."
-    echo "  These files MUST remain synchronized."
-    echo ""
-    echo "REQUIRED ACTION:"
-    echo "  AGENTS.md must either:"
-    echo "  • Be deleted (so it can be auto-created from CLAUDE.md), OR"
-    echo "  • Be manually updated to match CLAUDE.md exactly"
-    echo ""
-    echo "────────────────────────────────────────────────────────────────────────────"
-    echo "RESOLUTION OPTIONS:"
-    echo "────────────────────────────────────────────────────────────────────────────"
-    echo ""
-    echo "Option 1: Delete AGENTS.md (it will be recreated automatically)"
-    echo "  $ rm AGENTS.md"
-    echo "  $ git add AGENTS.md"
-    echo "  $ git commit"
-    echo ""
-    echo "Option 2: Review differences and reconcile manually"
-    echo "  $ diff CLAUDE.md AGENTS.md          # See what's different"
-    echo "  # Manually edit AGENTS.md to match CLAUDE.md (or vice versa)"
-    echo "  # Then copy CLAUDE.md to AGENTS.md:"
-    echo "  $ cp CLAUDE.md AGENTS.md"
-    echo "  $ git add AGENTS.md"
-    echo "  $ git commit"
-    echo ""
-    echo "Option 3: Move/rename AGENTS.md if you need to preserve it"
-    echo "  $ mv AGENTS.md AGENTS.md.backup"
-    echo "  $ git add AGENTS.md AGENTS.md.backup"
-    echo "  # Reconcile changes later, then:"
-    echo "  $ cp CLAUDE.md AGENTS.md"
-    echo "  $ git add AGENTS.md"
-    echo "  $ git commit"
-    echo ""
-    echo "════════════════════════════════════════════════════════════════════════════"
-    echo ""
-    exit 1
-fi
-
-# Files are identical, no action needed
-exit 0
diff --git a/AGENTS.md b/AGENTS.md
index 65e90ae8..25864f5b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,188 +1,159 @@
-# VERA-MH: Validation of Ethical and Responsible AI in Mental Health
-Prototype for generating and evaluating LLM conversations in mental health contexts.
+# VERA-MH: Agent Guide
+
+Framework for generating and evaluating LLM conversations in mental health contexts. This file is for **any coding agent** (Cursor, Copilot, etc.). For Claude Code slash commands and `.claude/` maintenance, see [CLAUDE.md](./CLAUDE.md).
 
 ## Quick Start
+
 ```bash
-# Install uv if not already installed
 pip install uv
-
-# Set up environment and install dependencies
 uv sync
 source .venv/bin/activate  # Windows: .venv\Scripts\activate
-
-# Configure environment
-cp .env.example .env  # Add your API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY)
+cp .env.example .env       # Add API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY, etc.)
 ```
+
 **Python >= 3.11 required**
 
 ## Code Style
+
 - Minimal print statements
-- Prototype phase: prioritize clarity over perfection
-- Don't overthink implementation
-- Don't create example files
-- Use `python3` command explicitly
+- Prioritize clarity; match existing patterns in the module you touch
+- **Check for existing code first** — before adding a function or helper, search the repo for something that already does the job; extend or reuse it when possible
+- **Don't add abstractions unless asked** — avoid new base classes, wrappers, or indirection layers unless the task explicitly calls for them
+- Keep changes **small and understandable** — one logical change per edit; avoid drive-by refactors or unrelated cleanup in the same diff
+- When replacing behavior, **delete the old code** — don't leave dead paths, commented-out blocks, or "just in case" fallbacks behind
+- Don't create example files unless asked
+- Use `python3` or `uv run python` explicitly
+- Add or update tests when changing behavior
+
+## Architecture Map
+
+| Area | Key paths | When to edit |
+|------|-----------|--------------|
+| **Generation** | `generate.py`, `generate_conversations/` | Conversation simulation, turns, personas |
+| **Judging** | `judge.py`, `judge/` | Rubric scoring, TSV output, question navigation |
+| **LLM providers** | `llm_clients/`, `llm_clients/llm_factory.py` | New models, custom HTTP/API providers |
+| **Pipeline** | `run_pipeline.py`, `scripts/` | End-to-end generate → judge → score workflows |
+| **Data** | `data/` (personas, rubrics) | Evaluation inputs (committed) |
+| **Output** | `output/` (gitignored) | Generated transcripts, evaluations, logs |
+| **Config** | `utils/model_config_loader.py`, `llm_clients/config.py` | Model name resolution, API keys |
+| **Shared utils** | `utils/` | Naming, logging, conversation layout |
+
+**Entry points:** `generate.py` (simulate), `judge.py` (evaluate), `run_pipeline.py` (full workflow), `judge/score.py` (scoring/visualization).
+
+**Temporary experiments:** `tmp_tests/` (not committed). **Permanent tests:** `tests/`.
 
-## File Organization
-- **Temporary tests**: `tmp_tests/` (not committed)
-- **Main scripts**: `generate.py`, `judge.py` at root
-- **Core modules**: Implementation in main directory
-- **Docs**: See `docs/` for detailed guides
+## Testing
 
-## Code Quality Tools
-- **Formatting**: `uv run ruff format .`
-- **Linting**: `uv run ruff check .`
-- **Type checking**: `uv run pyright` (basic mode)
-- **Pre-commit**: `pre-commit install` (auto-run checks on commit)
-- All configuration in `pyproject.toml`
-- **📖 See**: `docs/pre-commit-hooks.md` for pre-commit documentation
+The project uses [pytest](https://docs.pytest.org/) with unit and integration tests under `tests/`. Coverage is enforced (`--cov-fail-under=30` in `pyproject.toml`).
 
-## Git Conventions
+**Layout:**
+- `tests/unit/` — fast, isolated tests
+- `tests/integration/` — component interactions and CLI flows
+- `tests/fixtures/` — rubrics, personas, sample conversations
+- `tests/mocks/` — shared LLM mocks
 
-### Commit Message Format
-Follow [Conventional Commits](https://www.conventionalcommits.org/) format:
+The `e2e` marker exists in `pyproject.toml` but there is no `tests/e2e/` directory yet; use `integration` for workflow-level tests.
 
-```
-<type>: <description>
+**Commands:**
+```bash
+# Default local/CI run (no API keys needed)
+uv run pytest -m "not live"
 
-[optional body]
-```
+# Full suite with coverage (default addopts include --cov)
+uv run pytest
 
-**Types:**
-- `feat`: New feature or significant enhancement
-- `fix`: Bug fix
-- `refactor`: Code restructuring without behavior change
-- `test`: Adding or updating tests
-- `docs`: Documentation changes only
-- `chore`: Maintenance tasks (dependencies, config, tooling)
-- `style`: Code style/formatting changes only
-- `perf`: Performance improvements
-
-**Guidelines:**
-- Keep subject line under 72 characters
-- Use imperative mood ("add feature" not "added feature")
-- Don't end subject line with a period
-- Separate subject from body with blank line
-- Focus on *why* the change was made, not *what* changed
-- Make atomic commits (one logical change per commit)
-
-**Examples:**
-```bash
-feat: add support for GPT-4 model evaluation
-fix: handle missing conversation files gracefully
-docs: update README with new model options
-chore: upgrade langchain to v0.1.0
-test: add unit tests for judge scoring logic
+# Live tests only (requires API keys in .env)
+uv run pytest -m live
+
+# Single file or directory
+uv run pytest tests/unit/judge/test_score.py
+uv run pytest tests/integration/
 ```
 
-### Branch Naming
-Use descriptive branch names with type prefixes:
+**Markers:** `unit`, `integration`, `e2e`, `live` (see `pyproject.toml`). CI runs `pytest -m "not live"`; live tests run in a separate job when secrets are available.
 
-**Format:** `<type>/<brief-description>`
+**Scratch scripts:** put one-off experiments in `tmp_tests/` (not committed); permanent tests belong in `tests/`.
 
-**Types:**
-- `feat/` - New features
-- `fix/` - Bug fixes
-- `refactor/` - Code refactoring
-- `test/` - Testing infrastructure
-- `docs/` - Documentation updates
-- `chore/` - Maintenance and tooling
+## Key Commands
 
-**Examples:**
 ```bash
-feat/add-gpt4-support
-fix/conversation-file-handling
-refactor/cleanup-judge-logic
-test/unit-test-infrastructure
-docs/update-api-examples
-chore/upgrade-dependencies
-```
+# End-to-end pipeline (preferred for full workflows)
+uv run python run_pipeline.py \
+  --user-agent claude-sonnet-4-5-20250929 \
+  --provider-agent gpt-4o \
+  --runs 1 \
+  --turns 10 \
+  --judge-model claude-sonnet-4-5-20250929 \
+  --max-personas 5
+
+# Generate conversations only
+uv run python generate.py \
+  -u claude-sonnet-4-5-20250929 \
+  -p gpt-4o \
+  -t 6 -r 1
+
+# Judge/evaluate an existing generation run
+uv run python judge.py \
+  -f output/{YOUR_P_RUN}/ \
+  -j claude-sonnet-4-5-20250929
+
+# Recommended published-score profile (scripted)
+./scripts/run_recommended_vera_pipeline.sh <provider-agent-model>
 
-**Guidelines:**
-- Use kebab-case (lowercase with hyphens)
-- Keep names concise but descriptive
-- Avoid generic names like `fix/bug` or `feat/new-feature`
-- Delete branches after merging
+# Development
+uv sync
+uv add <package>
+uv add --dev <pkg>
 
-### Workflow
-1. **Create branch from main**: `git checkout -b type/description`
-2. **Make changes**: Follow code style and write tests
-3. **Commit frequently**: Make atomic, logical commits
-4. **Run quality checks**: Pre-commit hooks run automatically
-5. **Push and create PR**: `git push -u origin branch-name`
-6. **Use `/create-commits`**: Let Claude Code organize commits logically
+# Code quality
+uv run ruff format .
+uv run ruff check .
+uv run pyright
+pre-commit run --all-files
+```
 
-**Tip:** Use `/create-commits` slash command to analyze changes and create well-organized, logical commits automatically.
+Use dated model IDs (e.g. `claude-sonnet-4-5-20250929`) as in README; shorthand aliases may not resolve.
 
-## Testing
-- No formal test suite yet (prototype phase)
-- For temporary test scripts: use `tmp_tests/`
-- When adding permanent tests: use `pytest` with `tests/` directory
-- Run tests: `pytest` (when tests exist)
-- Coverage: `pytest --cov` (when needed)
-
-### Claude Code Testing Configuration
-The project uses Claude Code with custom testing commands and agents:
-- **Slash commands** (`.claude/commands/`) - User-facing testing workflows
-- **test-engineer agent** (`.claude/agents/`) - Automated testing in parallel
-
-**Maintenance guidelines:**
-1. **When testing patterns change** (pytest config, fixtures, conventions):
-   - Review and update relevant slash commands (`/test`, `/create-tests`, etc.)
-   - Agent reads command files directly, so updates auto-propagate
-   - Only update agent if commands are added/removed
-
-2. **When adding new testing commands:**
-   - Add to `.claude/commands/`
-   - Update `.claude/commands/README.md` and main `README.md`
-   - If it contains testing patterns, add reference to `.claude/agents/test-engineer.md`
-
-**Why this matters:**
-- Agents use slash commands as living documentation (via Read tool)
-- Keeping them in sync ensures consistent testing patterns
-- Single source of truth prevents duplication and drift
-
-## Tech Stack
-- **LLM Framework**: LangChain (multi-provider support)
-- **Supported Providers**: Anthropic, OpenAI, Google GenAI
-- **Data Validation**: Pydantic v2
-- **Data Processing**: Pandas
-- **Config Management**: python-dotenv
+## Code Quality Tools
 
-## Key Commands
-```bash
-# Generate conversations
-python3 generate.py -u claude-sonnet-4-5 -p claude-sonnet-4-5 -t 6 -r 1
+- **Formatting:** `uv run ruff format .`
+- **Linting:** `uv run ruff check .`
+- **Type checking:** `uv run pyright` (basic mode)
+- **Pre-commit:** `pre-commit install` — see `docs/pre-commit-hooks.md`
+- Configuration: `pyproject.toml`
 
-# Judge/evaluate conversations
-python3 judge.py -f output/{YOUR_P_RUN}/ -j claude-sonnet-4-5
+## Git Conventions
 
-# Development
-uv sync              # Install/update dependencies
-uv add <package>     # Add new dependency
-uv add --dev <pkg>   # Add dev dependency
+### Commit Message Format
 
-# Code quality
-uv run ruff format .   # Format code
-uv run ruff check .    # Lint code
-uv run pyright         # Type check
-pre-commit run --all-files  # Run all pre-commit hooks
-
-# Testing (when implemented)
-pytest               # Run tests
-pytest --cov         # Run with coverage
+Follow [Conventional Commits](https://www.conventionalcommits.org/):
+
+```
+<type>: <description>
 ```
 
+Types: `feat`, `fix`, `refactor`, `test`, `docs`, `chore`, `style`, `perf`. Write the subject in the imperative mood, keep it under 72 characters, and omit the trailing period.
+
+### Branch Naming
+
+Format: `<type>/<brief-description>` (kebab-case), e.g. `feat/add-gpt4-support`, `fix/conversation-file-handling`.
+
+### Workflow
+
+1. Branch from `main`
+2. Make changes; run `uv run pytest -m "not live"` for code changes
+3. Atomic commits; pre-commit hooks run on commit
+4. Push and open a PR
+
 ## Documentation Reference
-- **Setup & Architecture**: See `README.md`
-- **Pre-commit Hooks**: See `docs/pre-commit-hooks.md`
-- **Custom LLM Providers**: See `docs/evaluating.md`
-- **Usage Examples**: See `README.md` → "Usage" section
-- **Model Configuration**: See `README.md` → "Models" section
+
+- **Setup, pipeline, output layout:** [README.md](./README.md)
+- **Custom LLM providers:** [docs/evaluating.md](./docs/evaluating.md)
+- **Pre-commit hooks:** [docs/pre-commit-hooks.md](./docs/pre-commit-hooks.md)
+- **Claude Code commands:** [CLAUDE.md](./CLAUDE.md), [.claude/commands/](./.claude/commands/)
 
 ## Docker
+
 ```bash
-docker-compose up    # Run via Docker
+docker-compose up
 ```
-
----
-For detailed information, see README.md and docs/
diff --git a/CLAUDE.md b/CLAUDE.md
index 65e90ae8..681b9c42 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,44 +1,47 @@
-# VERA-MH: Validation of Ethical and Responsible AI in Mental Health
-Prototype for generating and evaluating LLM conversations in mental health contexts.
+# VERA-MH: Claude Code Guide
+
+Framework for generating and evaluating LLM conversations in mental health contexts. This file is for **Claude Code** (slash commands, `.claude/` config). For agent-agnostic guidance (architecture, testing, domain guardrails), see [AGENTS.md](./AGENTS.md).
 
 ## Quick Start
+
 ```bash
-# Install uv if not already installed
 pip install uv
-
-# Set up environment and install dependencies
 uv sync
 source .venv/bin/activate  # Windows: .venv\Scripts\activate
-
-# Configure environment
-cp .env.example .env  # Add your API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY)
+cp .env.example .env       # Add API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY)
 ```
+
 **Python >= 3.11 required**
 
 ## Code Style
+
 - Minimal print statements
-- Prototype phase: prioritize clarity over perfection
-- Don't overthink implementation
-- Don't create example files
-- Use `python3` command explicitly
+- Prioritize clarity; match existing patterns in the module you touch
+- Don't create example files unless asked
+- Use `python3` or `uv run python` explicitly
+- Add or update tests when changing behavior
 
 ## File Organization
+
 - **Temporary tests**: `tmp_tests/` (not committed)
-- **Main scripts**: `generate.py`, `judge.py` at root
-- **Core modules**: Implementation in main directory
-- **Docs**: See `docs/` for detailed guides
+- **Main scripts**: `generate.py`, `judge.py`, `run_pipeline.py` at root
+- **Packages**: `generate_conversations/`, `judge/`, `llm_clients/`, `utils/`
+- **Permanent tests**: `tests/` (unit and integration)
+- **Docs**: `docs/`; agent architecture map in [AGENTS.md](./AGENTS.md)
 
 ## Code Quality Tools
+
 - **Formatting**: `uv run ruff format .`
 - **Linting**: `uv run ruff check .`
 - **Type checking**: `uv run pyright` (basic mode)
 - **Pre-commit**: `pre-commit install` (auto-run checks on commit)
 - All configuration in `pyproject.toml`
-- **📖 See**: `docs/pre-commit-hooks.md` for pre-commit documentation
+- See `docs/pre-commit-hooks.md` for pre-commit documentation
 
 ## Git Conventions
 
 ### Commit Message Format
+
 Follow [Conventional Commits](https://www.conventionalcommits.org/) format:
 
 ```
@@ -47,142 +50,84 @@ Follow [Conventional Commits](https://www.conventionalcommits.org/) format:
 [optional body]
 ```
 
-**Types:**
-- `feat`: New feature or significant enhancement
-- `fix`: Bug fix
-- `refactor`: Code restructuring without behavior change
-- `test`: Adding or updating tests
-- `docs`: Documentation changes only
-- `chore`: Maintenance tasks (dependencies, config, tooling)
-- `style`: Code style/formatting changes only
-- `perf`: Performance improvements
-
-**Guidelines:**
-- Keep subject line under 72 characters
-- Use imperative mood ("add feature" not "added feature")
-- Don't end subject line with a period
-- Separate subject from body with blank line
-- Focus on *why* the change was made, not *what* changed
-- Make atomic commits (one logical change per commit)
-
-**Examples:**
-```bash
-feat: add support for GPT-4 model evaluation
-fix: handle missing conversation files gracefully
-docs: update README with new model options
-chore: upgrade langchain to v0.1.0
-test: add unit tests for judge scoring logic
-```
-
-### Branch Naming
-Use descriptive branch names with type prefixes:
+**Types:** `feat`, `fix`, `refactor`, `test`, `docs`, `chore`, `style`, `perf`
 
-**Format:** `<type>/<brief-description>`
+**Guidelines:** Imperative mood, under 72 characters, no trailing period, atomic commits.
 
-**Types:**
-- `feat/` - New features
-- `fix/` - Bug fixes
-- `refactor/` - Code refactoring
-- `test/` - Testing infrastructure
-- `docs/` - Documentation updates
-- `chore/` - Maintenance and tooling
-
-**Examples:**
-```bash
-feat/add-gpt4-support
-fix/conversation-file-handling
-refactor/cleanup-judge-logic
-test/unit-test-infrastructure
-docs/update-api-examples
-chore/upgrade-dependencies
-```
+### Branch Naming
 
-**Guidelines:**
-- Use kebab-case (lowercase with hyphens)
-- Keep names concise but descriptive
-- Avoid generic names like `fix/bug` or `feat/new-feature`
-- Delete branches after merging
+**Format:** `<type>/<brief-description>` (kebab-case), e.g. `feat/add-gpt4-support`
 
 ### Workflow
+
 1. **Create branch from main**: `git checkout -b type/description`
-2. **Make changes**: Follow code style and write tests
-3. **Commit frequently**: Make atomic, logical commits
+2. **Make changes**: Follow code style; run `uv run pytest -m "not live"`
+3. **Commit frequently**: Atomic, logical commits
 4. **Run quality checks**: Pre-commit hooks run automatically
 5. **Push and create PR**: `git push -u origin branch-name`
 6. **Use `/create-commits`**: Let Claude Code organize commits logically
 
-**Tip:** Use `/create-commits` slash command to analyze changes and create well-organized, logical commits automatically.
-
 ## Testing
-- No formal test suite yet (prototype phase)
-- For temporary test scripts: use `tmp_tests/`
-- When adding permanent tests: use `pytest` with `tests/` directory
-- Run tests: `pytest` (when tests exist)
-- Coverage: `pytest --cov` (when needed)
+
+See [AGENTS.md](./AGENTS.md) for full testing policy. Summary:
+
+- `tests/unit/` and `tests/integration/`; fixtures in `tests/fixtures/`
+- Default: `uv run pytest -m "not live"` (CI-safe, no API keys)
+- Live API tests: `uv run pytest -m live`
+- Coverage enforced via `pyproject.toml` (`--cov-fail-under=30`)
 
 ### Claude Code Testing Configuration
-The project uses Claude Code with custom testing commands and agents:
-- **Slash commands** (`.claude/commands/`) - User-facing testing workflows
-- **test-engineer agent** (`.claude/agents/`) - Automated testing in parallel
+
+- **Slash commands** (`.claude/commands/`) — `/test`, `/fix-tests`, `/create-tests`
+- **test-engineer agent** (`.claude/agents/`) — parallel test runs
 
 **Maintenance guidelines:**
+
 1. **When testing patterns change** (pytest config, fixtures, conventions):
-   - Review and update relevant slash commands (`/test`, `/create-tests`, etc.)
-   - Agent reads command files directly, so updates auto-propagate
-   - Only update agent if commands are added/removed
+   - Update relevant slash commands (`/test`, `/create-tests`, etc.)
+   - Update [AGENTS.md](./AGENTS.md) if agent-facing policy changes
+   - Only update `test-engineer` agent if commands are added/removed
 
 2. **When adding new testing commands:**
    - Add to `.claude/commands/`
-   - Update `.claude/commands/README.md` and main `README.md`
-   - If it contains testing patterns, add reference to `.claude/agents/test-engineer.md`
-
-**Why this matters:**
-- Agents use slash commands as living documentation (via Read tool)
-- Keeping them in sync ensures consistent testing patterns
-- Single source of truth prevents duplication and drift
-
-## Tech Stack
-- **LLM Framework**: LangChain (multi-provider support)
-- **Supported Providers**: Anthropic, OpenAI, Google GenAI
-- **Data Validation**: Pydantic v2
-- **Data Processing**: Pandas
-- **Config Management**: python-dotenv
+   - Update `.claude/commands/README.md` and `README.md`
+   - Reference in `.claude/agents/test-engineer.md` if applicable
 
 ## Key Commands
+
 ```bash
-# Generate conversations
-python3 generate.py -u claude-sonnet-4-5 -p claude-sonnet-4-5 -t 6 -r 1
-
-# Judge/evaluate conversations
-python3 judge.py -f output/{YOUR_P_RUN}/ -j claude-sonnet-4-5
-
-# Development
-uv sync              # Install/update dependencies
-uv add <package>     # Add new dependency
-uv add --dev <pkg>   # Add dev dependency
-
-# Code quality
-uv run ruff format .   # Format code
-uv run ruff check .    # Lint code
-uv run pyright         # Type check
-pre-commit run --all-files  # Run all pre-commit hooks
-
-# Testing (when implemented)
-pytest               # Run tests
-pytest --cov         # Run with coverage
+# Slash-command alternatives: /run-generator, /run-judge, /test, /format
+
+# End-to-end pipeline
+uv run python run_pipeline.py \
+  --user-agent claude-sonnet-4-5-20250929 \
+  --provider-agent gpt-4o \
+  --runs 1 --turns 10 \
+  --judge-model claude-sonnet-4-5-20250929 \
+  --max-personas 5
+
+# Generate / judge (step by step)
+uv run python generate.py -u claude-sonnet-4-5-20250929 -p gpt-4o -t 6 -r 1
+uv run python judge.py -f output/{YOUR_P_RUN}/ -j claude-sonnet-4-5-20250929
+
+# Development & quality
+uv sync
+uv run ruff format .
+uv run ruff check .
+uv run pyright
+uv run pytest -m "not live"
+pre-commit run --all-files
 ```
 
 ## Documentation Reference
-- **Setup & Architecture**: See `README.md`
-- **Pre-commit Hooks**: See `docs/pre-commit-hooks.md`
-- **Custom LLM Providers**: See `docs/evaluating.md`
-- **Usage Examples**: See `README.md` → "Usage" section
-- **Model Configuration**: See `README.md` → "Models" section
+
+- **Agent guide (architecture, guardrails, testing):** [AGENTS.md](./AGENTS.md)
+- **Setup & usage:** [README.md](./README.md)
+- **Custom LLM providers:** [docs/evaluating.md](./docs/evaluating.md)
+- **Slash commands:** [.claude/commands/README.md](./.claude/commands/README.md)
 
 ## Docker
+
 ```bash
-docker-compose up    # Run via Docker
+docker-compose up
 ```
-
----
-For detailed information, see README.md and docs/
diff --git a/docs/pre-commit-hooks.md b/docs/pre-commit-hooks.md
index 1531b0ec..beb1d26e 100644
--- a/docs/pre-commit-hooks.md
+++ b/docs/pre-commit-hooks.md
@@ -10,40 +10,17 @@ pre-commit install   # Activates hooks
 ## Hooks
 
 ### Standard: Ruff
-Auto-formats and lints Python code. Configuration in `pyproject.toml`.
-
-### Custom: CLAUDE.md → AGENTS.md Sync
-
-**What:** Automatically keeps `AGENTS.md` identical to `CLAUDE.md`.
 
-**Why:** Both files must contain the same project instructions - `CLAUDE.md` for Claude Code, `AGENTS.md` for custom agents.
+Auto-formats and lints Python code. Configuration in `pyproject.toml`.
 
-**Behavior:**
-- If `AGENTS.md` doesn't exist → creates it from `CLAUDE.md`
-- If `AGENTS.md` exists and matches → passes
-- If `AGENTS.md` exists and differs → **fails with error**
+## Agent Documentation
 
-### Resolving Sync Conflicts
+`AGENTS.md` and `CLAUDE.md` are **intentionally separate**:
 
-If the hook fails, choose one:
+- **[AGENTS.md](../AGENTS.md)** — agent-agnostic guide (architecture, testing, domain guardrails, key commands)
+- **[CLAUDE.md](../CLAUDE.md)** — Claude Code slash commands and `.claude/` maintenance
 
-```bash
-# Option 1: Delete AGENTS.md (simplest - auto-recreated)
-rm AGENTS.md
-git add AGENTS.md
-git commit
-
-# Option 2: Reconcile manually
-diff CLAUDE.md AGENTS.md
-cp CLAUDE.md AGENTS.md
-git add AGENTS.md
-git commit
-
-# Option 3: Preserve current AGENTS.md temporarily
-mv AGENTS.md AGENTS.md.backup
-git add AGENTS.md AGENTS.md.backup
-git commit
-```
+Update the file that matches your audience. There is no pre-commit sync between them.
 
 ## Manual Usage
 
@@ -53,5 +30,4 @@ pre-commit run --all-files    # Run all hooks
 
 ## Configuration
 
-- `.pre-commit-config.yaml` - Hook configuration
-- `.pre-commit-scripts/sync-claude-to-agents.sh` - Custom sync script
+- `.pre-commit-config.yaml` — Hook configuration

From c2d6776ca3a567fc790eccf2358a364ebc060dd9 Mon Sep 17 00:00:00 2001
From: Luca Belli <129434630+sator-labs@users.noreply.github.com>
Date: Tue, 16 Jun 2026 17:13:09 -0700
Subject: [PATCH 2/4] docs: add documentation map and dedupe CLAUDE.md

Centralize doc ownership in AGENTS.md and keep CLAUDE.md focused on slash commands and Claude-specific workflow.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 AGENTS.md                |  19 ++++++-
 CLAUDE.md                | 111 ++++++---------------------------------
 docs/pre-commit-hooks.md |   2 +-
 3 files changed, 35 insertions(+), 97 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 25864f5b..06e9cc43 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -145,10 +145,27 @@ Format: `<type>/<brief-description>` (kebab-case), e.g. `feat/add-gpt4-support`,
 3. Atomic commits; pre-commit hooks run on commit
 4. Push and open a PR
 
-## Documentation Reference
+## Documentation Map
+
+One canonical home per concern — cross-link, don't copy paragraphs.
+
+| Doc | Audience | Use for |
+|-----|----------|---------|
+| [README.md](./README.md) | Humans | Setup, CLI usage, output layout, detailed architecture |
+| **AGENTS.md** (this file) | All coding agents | Style, architecture map, testing, key commands, git conventions |
+| [CLAUDE.md](./CLAUDE.md) | Claude Code only | Slash commands, `.claude/` maintenance |
+| [docs/](./docs/) | Humans and agents | Topic deep dives (see links below) |
+
+**When to update which file:** pytest/CI policy → AGENTS.md; new CLI flag or output layout → README (+ AGENTS key commands if agents run it often); LLM provider integration → [docs/evaluating.md](./docs/evaluating.md); Claude slash commands → `.claude/commands/` + CLAUDE.md + README command list.
+
+**OpenSpec:** not used in this repo. Consider [OpenSpec](https://github.com/Fission-AI/OpenSpec) only for large multi-file features where you want agreed behavioral specs before coding (e.g. new judge dimensions, pipeline CLI changes). It complements — does not replace — AGENTS.md or README.
+
+### Links
 
 - **Setup, pipeline, output layout:** [README.md](./README.md)
 - **Custom LLM providers:** [docs/evaluating.md](./docs/evaluating.md)
+- **Judge behavior:** [docs/judge.md](./docs/judge.md)
+- **Structured output:** [docs/structured-output.md](./docs/structured-output.md)
 - **Pre-commit hooks:** [docs/pre-commit-hooks.md](./docs/pre-commit-hooks.md)
 - **Claude Code commands:** [CLAUDE.md](./CLAUDE.md), [.claude/commands/](./.claude/commands/)
 
diff --git a/CLAUDE.md b/CLAUDE.md
index 681b9c42..2b0fda05 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,71 +1,24 @@
 # VERA-MH: Claude Code Guide
 
-Framework for generating and evaluating LLM conversations in mental health contexts. This file is for **Claude Code** (slash commands, `.claude/` config). For agent-agnostic guidance (architecture, testing, domain guardrails), see [AGENTS.md](./AGENTS.md).
+Framework for generating and evaluating LLM conversations in mental health contexts. This file is for **Claude Code** (slash commands, `.claude/` config). For agent-agnostic guidance (architecture, testing, domain guardrails, CLI commands, git conventions), see [AGENTS.md](./AGENTS.md).
 
-## Quick Start
+## Slash Commands
 
-```bash
-pip install uv
-uv sync
-source .venv/bin/activate  # Windows: .venv\Scripts\activate
-cp .env.example .env       # Add API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY)
-```
+Prefer these over retyping CLI commands from [AGENTS.md](./AGENTS.md):
 
-**Python >= 3.11 required**
+| Area | Commands |
+|------|----------|
+| Setup | `/setup-dev` |
+| Code quality | `/format` |
+| VERA-MH | `/run-generator`, `/run-judge` |
+| Testing | `/test`, `/fix-tests`, `/create-tests [module] [--layer=unit\|integration\|e2e]` |
+| Git | `/create-commits`, `/create-pr` |
 
-## Code Style
+Full command docs: [.claude/commands/README.md](./.claude/commands/README.md)
 
-- Minimal print statements
-- Prioritize clarity; match existing patterns in the module you touch
-- Don't create example files unless asked
-- Use `python3` or `uv run python` explicitly
-- Add or update tests when changing behavior
+## Git Workflow (Claude Code)
 
-## File Organization
-
-- **Temporary tests**: `tmp_tests/` (not committed)
-- **Main scripts**: `generate.py`, `judge.py`, `run_pipeline.py` at root
-- **Packages**: `generate_conversations/`, `judge/`, `llm_clients/`, `utils/`
-- **Permanent tests**: `tests/` (unit and integration)
-- **Docs**: `docs/`; agent architecture map in [AGENTS.md](./AGENTS.md)
-
-## Code Quality Tools
-
-- **Formatting**: `uv run ruff format .`
-- **Linting**: `uv run ruff check .`
-- **Type checking**: `uv run pyright` (basic mode)
-- **Pre-commit**: `pre-commit install` (auto-run checks on commit)
-- All configuration in `pyproject.toml`
-- See `docs/pre-commit-hooks.md` for pre-commit documentation
-
-## Git Conventions
-
-### Commit Message Format
-
-Follow [Conventional Commits](https://www.conventionalcommits.org/) format:
-
-```
-<type>: <description>
-
-[optional body]
-```
-
-**Types:** `feat`, `fix`, `refactor`, `test`, `docs`, `chore`, `style`, `perf`
-
-**Guidelines:** Imperative mood, under 72 characters, no trailing period, atomic commits.
-
-### Branch Naming
-
-**Format:** `<type>/<brief-description>` (kebab-case), e.g. `feat/add-gpt4-support`
-
-### Workflow
-
-1. **Create branch from main**: `git checkout -b type/description`
-2. **Make changes**: Follow code style; run `uv run pytest -m "not live"`
-3. **Commit frequently**: Atomic, logical commits
-4. **Run quality checks**: Pre-commit hooks run automatically
-5. **Push and create PR**: `git push -u origin branch-name`
-6. **Use `/create-commits`**: Let Claude Code organize commits logically
+Use `/create-commits` to organize commits logically, then `/create-pr` for the pull request. All other git conventions are in [AGENTS.md](./AGENTS.md).
 
 ## Testing
 
@@ -93,41 +46,9 @@ See [AGENTS.md](./AGENTS.md) for full testing policy. Summary:
    - Update `.claude/commands/README.md` and `README.md`
    - Reference in `.claude/agents/test-engineer.md` if applicable
 
-## Key Commands
-
-```bash
-# Slash-command alternatives: /run-generator, /run-judge, /test, /format
-
-# End-to-end pipeline
-uv run python run_pipeline.py \
-  --user-agent claude-sonnet-4-5-20250929 \
-  --provider-agent gpt-4o \
-  --runs 1 --turns 10 \
-  --judge-model claude-sonnet-4-5-20250929 \
-  --max-personas 5
-
-# Generate / judge (step by step)
-uv run python generate.py -u claude-sonnet-4-5-20250929 -p gpt-4o -t 6 -r 1
-uv run python judge.py -f output/{YOUR_P_RUN}/ -j claude-sonnet-4-5-20250929
-
-# Development & quality
-uv sync
-uv run ruff format .
-uv run ruff check .
-uv run pyright
-uv run pytest -m "not live"
-pre-commit run --all-files
-```
-
 ## Documentation Reference
 
-- **Agent guide (architecture, guardrails, testing):** [AGENTS.md](./AGENTS.md)
-- **Setup & usage:** [README.md](./README.md)
-- **Custom LLM providers:** [docs/evaluating.md](./docs/evaluating.md)
-- **Slash commands:** [.claude/commands/README.md](./.claude/commands/README.md)
+See the **Documentation map** in [AGENTS.md](./AGENTS.md). Claude-specific entry points:
 
-## Docker
-
-```bash
-docker-compose up
-```
+- **Slash commands:** [.claude/commands/README.md](./.claude/commands/README.md)
+- **Team settings:** [`.claude/settings.json`](./.claude/settings.json) (shared); `.claude/settings.local.json` (personal, not committed)
diff --git a/docs/pre-commit-hooks.md b/docs/pre-commit-hooks.md
index beb1d26e..3ef454f9 100644
--- a/docs/pre-commit-hooks.md
+++ b/docs/pre-commit-hooks.md
@@ -15,7 +15,7 @@ Auto-formats and lints Python code. Configuration in `pyproject.toml`.
 
 ## Agent Documentation
 
-`AGENTS.md` and `CLAUDE.md` are **intentionally separate**:
+`AGENTS.md` and `CLAUDE.md` are **intentionally separate** (see the **Documentation map** in [AGENTS.md](../AGENTS.md)):
 
 - **[AGENTS.md](../AGENTS.md)** — agent-agnostic guide (architecture, testing, domain guardrails, key commands)
 - **[CLAUDE.md](../CLAUDE.md)** — Claude Code slash commands and `.claude/` maintenance

From 84521583550e9b72219c74eb76a995e5523bf8d5 Mon Sep 17 00:00:00 2001
From: Luca Belli <129434630+sator-labs@users.noreply.github.com>
Date: Tue, 16 Jun 2026 17:14:45 -0700
Subject: [PATCH 3/4] updating gitignore

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 8cfca445..c9f57867 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,10 @@ coverage.json
 htmlcov/
 !tests/fixtures/conversations/
 
+# IDEs
+.cursor/
+.vscode/
+
 # Claude Code - personal settings and local files only
 .claude/settings.local.json
 .claude/hooks/local/

From 7c66b89ded107e88223f7de40a935b9b2876020b Mon Sep 17 00:00:00 2001
From: Luca Belli <129434630+sator-labs@users.noreply.github.com>
Date: Tue, 16 Jun 2026 17:49:12 -0700
Subject: [PATCH 4/4] docs: add /review, /code-review, and /verify slash
 commands

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .claude/commands/verify.md | 11 +++++++++++
 CLAUDE.md                  |  3 ++-
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 .claude/commands/verify.md

diff --git a/.claude/commands/verify.md b/.claude/commands/verify.md
new file mode 100644
index 00000000..998863bc
--- /dev/null
+++ b/.claude/commands/verify.md
@@ -0,0 +1,11 @@
+Verify that a code change works correctly without hitting live APIs.
+
+IMPORTANT: Never run `generate.py`, `judge.py`, or `run_pipeline.py` directly — these make live LLM API calls. Verification must use the test suite only.
+
+Steps:
+1. Identify what changed (`git diff --stat`) to understand the scope
+2. Run the relevant tests with `uv run pytest -m "not live" -v` targeting changed modules where possible
+3. Run the full non-live suite to check for regressions: `uv run pytest -m "not live"`
+4. Report: which tests passed/failed, coverage delta if relevant, and whether the change behaves as expected based on test output
+
+If the change cannot be verified without a live API call, say so explicitly and suggest what a targeted live test would look like (but do not run it).
diff --git a/CLAUDE.md b/CLAUDE.md
index 2b0fda05..b31604f6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -12,7 +12,8 @@ Prefer these over retyping CLI commands from [AGENTS.md](./AGENTS.md):
 | Code quality | `/format` |
 | VERA-MH | `/run-generator`, `/run-judge` |
 | Testing | `/test`, `/fix-tests`, `/create-tests [module] [--layer=unit\|integration\|e2e]` |
-| Git | `/create-commits`, `/create-pr` |
+| Git | `/create-commits`, `/create-pr`, `/review` (local diff), `/code-review` (PR) |
+| Verify | `/verify` (runs `uv run pytest -m "not live"` — never hits live APIs) |
 
 Full command docs: [.claude/commands/README.md](./.claude/commands/README.md)