From 58ebd5499d08355e66b85bae4f995ef2d83fadbd Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Mon, 15 Jun 2026 09:38:03 -0700 Subject: [PATCH 1/4] first draft --- .pre-commit-config.yaml | 8 - .pre-commit-scripts/sync-claude-to-agents.sh | 71 ----- AGENTS.md | 269 +++++++++---------- CLAUDE.md | 197 +++++--------- docs/pre-commit-hooks.md | 38 +-- 5 files changed, 198 insertions(+), 385 deletions(-) delete mode 100755 .pre-commit-scripts/sync-claude-to-agents.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 653988d1..6ba90604 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,11 +6,3 @@ repos: args: [--fix] - id: ruff-format - - repo: local - hooks: - - id: sync-claude-to-agents - name: Sync CLAUDE.md to AGENTS.md - entry: .pre-commit-scripts/sync-claude-to-agents.sh - language: script - files: ^CLAUDE\.md$ - pass_filenames: false diff --git a/.pre-commit-scripts/sync-claude-to-agents.sh b/.pre-commit-scripts/sync-claude-to-agents.sh deleted file mode 100755 index 043cd014..00000000 --- a/.pre-commit-scripts/sync-claude-to-agents.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -# Pre-commit hook to sync CLAUDE.md to AGENTS.md -# Fails if AGENTS.md exists and differs from CLAUDE.md - -set -e - -CLAUDE_FILE="CLAUDE.md" -AGENTS_FILE="AGENTS.md" - -# Check if CLAUDE.md exists -if [ ! -f "$CLAUDE_FILE" ]; then - echo "Error: $CLAUDE_FILE not found" - exit 1 -fi - -# If AGENTS.md doesn't exist, create it -if [ ! -f "$AGENTS_FILE" ]; then - echo "Creating $AGENTS_FILE from $CLAUDE_FILE" - cp "$CLAUDE_FILE" "$AGENTS_FILE" - git add "$AGENTS_FILE" - exit 0 -fi - -# If AGENTS.md exists, check if it differs from CLAUDE.md -if ! 
diff -q "$CLAUDE_FILE" "$AGENTS_FILE" > /dev/null 2>&1; then - echo "" - echo "════════════════════════════════════════════════════════════════════════════" - echo "❌ ERROR: AGENTS.md must be identical to CLAUDE.md" - echo "════════════════════════════════════════════════════════════════════════════" - echo "" - echo "PROBLEM:" - echo " You modified CLAUDE.md, but AGENTS.md already exists with different content." - echo " These files MUST remain synchronized." - echo "" - echo "REQUIRED ACTION:" - echo " AGENTS.md must either:" - echo " • Be deleted (so it can be auto-created from CLAUDE.md), OR" - echo " • Be manually updated to match CLAUDE.md exactly" - echo "" - echo "────────────────────────────────────────────────────────────────────────────" - echo "RESOLUTION OPTIONS:" - echo "────────────────────────────────────────────────────────────────────────────" - echo "" - echo "Option 1: Delete AGENTS.md (it will be recreated automatically)" - echo " $ rm AGENTS.md" - echo " $ git add AGENTS.md" - echo " $ git commit" - echo "" - echo "Option 2: Review differences and reconcile manually" - echo " $ diff CLAUDE.md AGENTS.md # See what's different" - echo " # Manually edit AGENTS.md to match CLAUDE.md (or vice versa)" - echo " # Then copy CLAUDE.md to AGENTS.md:" - echo " $ cp CLAUDE.md AGENTS.md" - echo " $ git add AGENTS.md" - echo " $ git commit" - echo "" - echo "Option 3: Move/rename AGENTS.md if you need to preserve it" - echo " $ mv AGENTS.md AGENTS.md.backup" - echo " $ git add AGENTS.md AGENTS.md.backup" - echo " # Reconcile changes later, then:" - echo " $ cp CLAUDE.md AGENTS.md" - echo " $ git add AGENTS.md" - echo " $ git commit" - echo "" - echo "════════════════════════════════════════════════════════════════════════════" - echo "" - exit 1 -fi - -# Files are identical, no action needed -exit 0 diff --git a/AGENTS.md b/AGENTS.md index 65e90ae8..25864f5b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,188 +1,159 @@ -# VERA-MH: Validation of Ethical and 
Responsible AI in Mental Health -Prototype for generating and evaluating LLM conversations in mental health contexts. +# VERA-MH: Agent Guide + +Framework for generating and evaluating LLM conversations in mental health contexts. This file is for **any coding agent** (Cursor, Copilot, etc.). For Claude Code slash commands and `.claude/` maintenance, see [CLAUDE.md](./CLAUDE.md). ## Quick Start + ```bash -# Install uv if not already installed pip install uv - -# Set up environment and install dependencies uv sync source .venv/bin/activate # Windows: .venv\Scripts\activate - -# Configure environment -cp .env.example .env # Add your API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY) +cp .env.example .env # Add API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY, etc.) ``` + **Python >= 3.11 required** ## Code Style + - Minimal print statements -- Prototype phase: prioritize clarity over perfection -- Don't overthink implementation -- Don't create example files -- Use `python3` command explicitly +- Prioritize clarity; match existing patterns in the module you touch +- **Check for existing code first** — before adding a function or helper, search the repo for something that already does the job; extend or reuse it when possible +- **Don't add abstractions unless asked** — avoid new base classes, wrappers, or indirection layers unless the task explicitly calls for them +- Keep changes **small and understandable** — one logical change per edit; avoid drive-by refactors or unrelated cleanup in the same diff +- When replacing behavior, **delete the old code** — don't leave dead paths, commented-out blocks, or "just in case" fallbacks behind +- Don't create example files unless asked +- Use `python3` or `uv run python` explicitly +- Add or update tests when changing behavior + +## Architecture Map + +| Area | Key paths | When to edit | +|------|-----------|--------------| +| **Generation** | `generate.py`, `generate_conversations/` | Conversation simulation, turns, personas | +| 
**Judging** | `judge.py`, `judge/` | Rubric scoring, TSV output, question navigation | +| **LLM providers** | `llm_clients/`, `llm_clients/llm_factory.py` | New models, custom HTTP/API providers | +| **Pipeline** | `run_pipeline.py`, `scripts/` | End-to-end generate → judge → score workflows | +| **Data** | `data/` (personas, rubrics) | Evaluation inputs (committed) | +| **Output** | `output/` (gitignored) | Generated transcripts, evaluations, logs | +| **Config** | `utils/model_config_loader.py`, `llm_clients/config.py` | Model name resolution, API keys | +| **Shared utils** | `utils/` | Naming, logging, conversation layout | + +**Entry points:** `generate.py` (simulate), `judge.py` (evaluate), `run_pipeline.py` (full workflow), `judge/score.py` (scoring/visualization). + +**Temporary experiments:** `tmp_tests/` (not committed). **Permanent tests:** `tests/`. -## File Organization -- **Temporary tests**: `tmp_tests/` (not committed) -- **Main scripts**: `generate.py`, `judge.py` at root -- **Core modules**: Implementation in main directory -- **Docs**: See `docs/` for detailed guides +## Testing -## Code Quality Tools -- **Formatting**: `uv run ruff format .` -- **Linting**: `uv run ruff check .` -- **Type checking**: `uv run pyright` (basic mode) -- **Pre-commit**: `pre-commit install` (auto-run checks on commit) -- All configuration in `pyproject.toml` -- **📖 See**: `docs/pre-commit-hooks.md` for pre-commit documentation +The project uses [pytest](https://docs.pytest.org/) with unit and integration tests under `tests/`. Coverage is enforced (`--cov-fail-under=30` in `pyproject.toml`). 
-## Git Conventions +**Layout:** +- `tests/unit/` — fast, isolated tests +- `tests/integration/` — component interactions and CLI flows +- `tests/fixtures/` — rubrics, personas, sample conversations +- `tests/mocks/` — shared LLM mocks -### Commit Message Format -Follow [Conventional Commits](https://www.conventionalcommits.org/) format: +The `e2e` marker exists in `pyproject.toml` but there is no `tests/e2e/` directory yet; use `integration` for workflow-level tests. -``` -: +**Commands:** +```bash +# Default local/CI run (no API keys needed) +uv run pytest -m "not live" -[optional body] -``` +# Full suite with coverage (default addopts include --cov) +uv run pytest -**Types:** -- `feat`: New feature or significant enhancement -- `fix`: Bug fix -- `refactor`: Code restructuring without behavior change -- `test`: Adding or updating tests -- `docs`: Documentation changes only -- `chore`: Maintenance tasks (dependencies, config, tooling) -- `style`: Code style/formatting changes only -- `perf`: Performance improvements - -**Guidelines:** -- Keep subject line under 72 characters -- Use imperative mood ("add feature" not "added feature") -- Don't end subject line with a period -- Separate subject from body with blank line -- Focus on *why* the change was made, not *what* changed -- Make atomic commits (one logical change per commit) - -**Examples:** -```bash -feat: add support for GPT-4 model evaluation -fix: handle missing conversation files gracefully -docs: update README with new model options -chore: upgrade langchain to v0.1.0 -test: add unit tests for judge scoring logic +# Live tests only (requires API keys in .env) +uv run pytest -m live + +# Single file or directory +uv run pytest tests/unit/judge/test_score.py +uv run pytest tests/integration/ ``` -### Branch Naming -Use descriptive branch names with type prefixes: +**Markers:** `unit`, `integration`, `e2e`, `live` (see `pyproject.toml`). 
CI runs `pytest -m "not live"`; live tests run in a separate job when secrets are available. -**Format:** `<type>/<description>` +**Scratch scripts:** use `tmp_tests/` for one-off experiments, not committed tests. -**Types:** -- `feat/` - New features -- `fix/` - Bug fixes -- `refactor/` - Code refactoring -- `test/` - Testing infrastructure -- `docs/` - Documentation updates -- `chore/` - Maintenance and tooling +## Key Commands -**Examples:** ```bash -feat/add-gpt4-support -fix/conversation-file-handling -refactor/cleanup-judge-logic -test/unit-test-infrastructure -docs/update-api-examples -chore/upgrade-dependencies -``` +# End-to-end pipeline (preferred for full workflows) +uv run python run_pipeline.py \ + --user-agent claude-sonnet-4-5-20250929 \ + --provider-agent gpt-4o \ + --runs 1 \ + --turns 10 \ + --judge-model claude-sonnet-4-5-20250929 \ + --max-personas 5 + +# Generate conversations only +uv run python generate.py \ + -u claude-sonnet-4-5-20250929 \ + -p gpt-4o \ + -t 6 -r 1 + +# Judge/evaluate an existing generation run +uv run python judge.py \ + -f output/{YOUR_P_RUN}/ \ + -j claude-sonnet-4-5-20250929 + +# Recommended published-score profile (scripted) +./scripts/run_recommended_vera_pipeline.sh -**Guidelines:** -- Use kebab-case (lowercase with hyphens) -- Keep names concise but descriptive -- Avoid generic names like `fix/bug` or `feat/new-feature` -- Delete branches after merging +# Development +uv sync +uv add <package> +uv add --dev <package> -### Workflow -1. **Create branch from main**: `git checkout -b type/description` -2. **Make changes**: Follow code style and write tests -3. **Commit frequently**: Make atomic, logical commits -4. **Run quality checks**: Pre-commit hooks run automatically -5. **Push and create PR**: `git push -u origin branch-name` -6. **Use `/create-commits`**: Let Claude Code organize commits logically +# Code quality +uv run ruff format . +uv run ruff check . 
+uv run pyright +pre-commit run --all-files +``` -**Tip:** Use `/create-commits` slash command to analyze changes and create well-organized, logical commits automatically. +Use dated model IDs (e.g. `claude-sonnet-4-5-20250929`) as in README; shorthand aliases may not resolve. -## Testing -- No formal test suite yet (prototype phase) -- For temporary test scripts: use `tmp_tests/` -- When adding permanent tests: use `pytest` with `tests/` directory -- Run tests: `pytest` (when tests exist) -- Coverage: `pytest --cov` (when needed) - -### Claude Code Testing Configuration -The project uses Claude Code with custom testing commands and agents: -- **Slash commands** (`.claude/commands/`) - User-facing testing workflows -- **test-engineer agent** (`.claude/agents/`) - Automated testing in parallel - -**Maintenance guidelines:** -1. **When testing patterns change** (pytest config, fixtures, conventions): - - Review and update relevant slash commands (`/test`, `/create-tests`, etc.) - - Agent reads command files directly, so updates auto-propagate - - Only update agent if commands are added/removed - -2. 
**When adding new testing commands:** - Add to `.claude/commands/` - Update `.claude/commands/README.md` and main `README.md` - If it contains testing patterns, add reference to `.claude/agents/test-engineer.md` - -**Why this matters:** -- Agents use slash commands as living documentation (via Read tool) -- Keeping them in sync ensures consistent testing patterns -- Single source of truth prevents duplication and drift - -## Tech Stack -- **LLM Framework**: LangChain (multi-provider support) -- **Supported Providers**: Anthropic, OpenAI, Google GenAI -- **Data Validation**: Pydantic v2 -- **Data Processing**: Pandas -- **Config Management**: python-dotenv +## Code Quality Tools -## Key Commands -```bash -# Generate conversations -python3 generate.py -u claude-sonnet-4-5 -p claude-sonnet-4-5 -t 6 -r 1 +- **Formatting:** `uv run ruff format .` +- **Linting:** `uv run ruff check .` +- **Type checking:** `uv run pyright` (basic mode) +- **Pre-commit:** `pre-commit install` — see `docs/pre-commit-hooks.md` +- Configuration: `pyproject.toml` -# Judge/evaluate conversations -python3 judge.py -f output/{YOUR_P_RUN}/ -j claude-sonnet-4-5 +## Git Conventions -# Development -uv sync # Install/update dependencies -uv add <package> # Add new dependency -uv add --dev <package> # Add dev dependency +### Commit Message Format -# Code quality -uv run ruff format . # Format code -uv run ruff check . # Lint code -uv run pyright # Type check -pre-commit run --all-files # Run all pre-commit hooks - -# Testing (when implemented) -pytest # Run tests -pytest --cov # Run with coverage +Follow [Conventional Commits](https://www.conventionalcommits.org/): + +``` +<type>: <description> ``` +Types: `feat`, `fix`, `refactor`, `test`, `docs`, `chore`, `style`, `perf`. Imperative mood, under 72 characters, no trailing period. + +### Branch Naming + +Format: `<type>/<description>` (kebab-case), e.g. `feat/add-gpt4-support`, `fix/conversation-file-handling`. + +### Workflow + +1. Branch from `main` +2. 
Make changes; run `uv run pytest -m "not live"` for code changes +3. Atomic commits; pre-commit hooks run on commit +4. Push and open a PR + ## Documentation Reference -- **Setup & Architecture**: See `README.md` -- **Pre-commit Hooks**: See `docs/pre-commit-hooks.md` -- **Custom LLM Providers**: See `docs/evaluating.md` -- **Usage Examples**: See `README.md` → "Usage" section -- **Model Configuration**: See `README.md` → "Models" section + +- **Setup, pipeline, output layout:** [README.md](./README.md) +- **Custom LLM providers:** [docs/evaluating.md](./docs/evaluating.md) +- **Pre-commit hooks:** [docs/pre-commit-hooks.md](./docs/pre-commit-hooks.md) +- **Claude Code commands:** [CLAUDE.md](./CLAUDE.md), [.claude/commands/](./.claude/commands/) ## Docker + ```bash -docker-compose up # Run via Docker +docker-compose up ``` - ---- -For detailed information, see README.md and docs/ diff --git a/CLAUDE.md b/CLAUDE.md index 65e90ae8..681b9c42 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,44 +1,47 @@ -# VERA-MH: Validation of Ethical and Responsible AI in Mental Health -Prototype for generating and evaluating LLM conversations in mental health contexts. +# VERA-MH: Claude Code Guide + +Framework for generating and evaluating LLM conversations in mental health contexts. This file is for **Claude Code** (slash commands, `.claude/` config). For agent-agnostic guidance (architecture, testing, domain guardrails), see [AGENTS.md](./AGENTS.md). 
## Quick Start + ```bash -# Install uv if not already installed pip install uv - -# Set up environment and install dependencies uv sync source .venv/bin/activate # Windows: .venv\Scripts\activate - -# Configure environment -cp .env.example .env # Add your API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY) +cp .env.example .env # Add API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY) ``` + **Python >= 3.11 required** ## Code Style + - Minimal print statements -- Prototype phase: prioritize clarity over perfection -- Don't overthink implementation -- Don't create example files -- Use `python3` command explicitly +- Prioritize clarity; match existing patterns in the module you touch +- Don't create example files unless asked +- Use `python3` or `uv run python` explicitly +- Add or update tests when changing behavior ## File Organization + - **Temporary tests**: `tmp_tests/` (not committed) -- **Main scripts**: `generate.py`, `judge.py` at root -- **Core modules**: Implementation in main directory -- **Docs**: See `docs/` for detailed guides +- **Main scripts**: `generate.py`, `judge.py`, `run_pipeline.py` at root +- **Packages**: `generate_conversations/`, `judge/`, `llm_clients/`, `utils/` +- **Permanent tests**: `tests/` (unit and integration) +- **Docs**: `docs/`; agent architecture map in [AGENTS.md](./AGENTS.md) ## Code Quality Tools + - **Formatting**: `uv run ruff format .` - **Linting**: `uv run ruff check .` - **Type checking**: `uv run pyright` (basic mode) - **Pre-commit**: `pre-commit install` (auto-run checks on commit) - All configuration in `pyproject.toml` -- **📖 See**: `docs/pre-commit-hooks.md` for pre-commit documentation +- See `docs/pre-commit-hooks.md` for pre-commit documentation ## Git Conventions ### Commit Message Format + Follow [Conventional Commits](https://www.conventionalcommits.org/) format: ``` @@ -47,142 +50,84 @@ Follow [Conventional Commits](https://www.conventionalcommits.org/) format: [optional body] ``` -**Types:** -- `feat`: New feature or 
significant enhancement -- `fix`: Bug fix -- `refactor`: Code restructuring without behavior change -- `test`: Adding or updating tests -- `docs`: Documentation changes only -- `chore`: Maintenance tasks (dependencies, config, tooling) -- `style`: Code style/formatting changes only -- `perf`: Performance improvements - -**Guidelines:** -- Keep subject line under 72 characters -- Use imperative mood ("add feature" not "added feature") -- Don't end subject line with a period -- Separate subject from body with blank line -- Focus on *why* the change was made, not *what* changed -- Make atomic commits (one logical change per commit) - -**Examples:** -```bash -feat: add support for GPT-4 model evaluation -fix: handle missing conversation files gracefully -docs: update README with new model options -chore: upgrade langchain to v0.1.0 -test: add unit tests for judge scoring logic -``` - -### Branch Naming -Use descriptive branch names with type prefixes: +**Types:** `feat`, `fix`, `refactor`, `test`, `docs`, `chore`, `style`, `perf` -**Format:** `/` +**Guidelines:** Imperative mood, under 72 characters, no trailing period, atomic commits. -**Types:** -- `feat/` - New features -- `fix/` - Bug fixes -- `refactor/` - Code refactoring -- `test/` - Testing infrastructure -- `docs/` - Documentation updates -- `chore/` - Maintenance and tooling - -**Examples:** -```bash -feat/add-gpt4-support -fix/conversation-file-handling -refactor/cleanup-judge-logic -test/unit-test-infrastructure -docs/update-api-examples -chore/upgrade-dependencies -``` +### Branch Naming -**Guidelines:** -- Use kebab-case (lowercase with hyphens) -- Keep names concise but descriptive -- Avoid generic names like `fix/bug` or `feat/new-feature` -- Delete branches after merging +**Format:** `/` (kebab-case), e.g. `feat/add-gpt4-support` ### Workflow + 1. **Create branch from main**: `git checkout -b type/description` -2. **Make changes**: Follow code style and write tests -3. 
**Commit frequently**: Make atomic, logical commits +2. **Make changes**: Follow code style; run `uv run pytest -m "not live"` +3. **Commit frequently**: Atomic, logical commits 4. **Run quality checks**: Pre-commit hooks run automatically 5. **Push and create PR**: `git push -u origin branch-name` 6. **Use `/create-commits`**: Let Claude Code organize commits logically -**Tip:** Use `/create-commits` slash command to analyze changes and create well-organized, logical commits automatically. - ## Testing -- No formal test suite yet (prototype phase) -- For temporary test scripts: use `tmp_tests/` -- When adding permanent tests: use `pytest` with `tests/` directory -- Run tests: `pytest` (when tests exist) -- Coverage: `pytest --cov` (when needed) + +See [AGENTS.md](./AGENTS.md) for full testing policy. Summary: + +- `tests/unit/` and `tests/integration/`; fixtures in `tests/fixtures/` +- Default: `uv run pytest -m "not live"` (CI-safe, no API keys) +- Live API tests: `uv run pytest -m live` +- Coverage enforced via `pyproject.toml` (`--cov-fail-under=30`) ### Claude Code Testing Configuration -The project uses Claude Code with custom testing commands and agents: -- **Slash commands** (`.claude/commands/`) - User-facing testing workflows -- **test-engineer agent** (`.claude/agents/`) - Automated testing in parallel + +- **Slash commands** (`.claude/commands/`) — `/test`, `/fix-tests`, `/create-tests` +- **test-engineer agent** (`.claude/agents/`) — parallel test runs **Maintenance guidelines:** + 1. **When testing patterns change** (pytest config, fixtures, conventions): - - Review and update relevant slash commands (`/test`, `/create-tests`, etc.) - - Agent reads command files directly, so updates auto-propagate - - Only update agent if commands are added/removed + - Update relevant slash commands (`/test`, `/create-tests`, etc.) + - Update [AGENTS.md](./AGENTS.md) if agent-facing policy changes + - Only update `test-engineer` agent if commands are added/removed 2. 
**When adding new testing commands:** - Add to `.claude/commands/` - - Update `.claude/commands/README.md` and main `README.md` - - If it contains testing patterns, add reference to `.claude/agents/test-engineer.md` - -**Why this matters:** -- Agents use slash commands as living documentation (via Read tool) -- Keeping them in sync ensures consistent testing patterns -- Single source of truth prevents duplication and drift - -## Tech Stack -- **LLM Framework**: LangChain (multi-provider support) -- **Supported Providers**: Anthropic, OpenAI, Google GenAI -- **Data Validation**: Pydantic v2 -- **Data Processing**: Pandas -- **Config Management**: python-dotenv + - Update `.claude/commands/README.md` and `README.md` + - Reference in `.claude/agents/test-engineer.md` if applicable ## Key Commands + ```bash -# Generate conversations -python3 generate.py -u claude-sonnet-4-5 -p claude-sonnet-4-5 -t 6 -r 1 - -# Judge/evaluate conversations -python3 judge.py -f output/{YOUR_P_RUN}/ -j claude-sonnet-4-5 - -# Development -uv sync # Install/update dependencies -uv add # Add new dependency -uv add --dev # Add dev dependency - -# Code quality -uv run ruff format . # Format code -uv run ruff check . # Lint code -uv run pyright # Type check -pre-commit run --all-files # Run all pre-commit hooks - -# Testing (when implemented) -pytest # Run tests -pytest --cov # Run with coverage +# Slash-command alternatives: /run-generator, /run-judge, /test, /format + +# End-to-end pipeline +uv run python run_pipeline.py \ + --user-agent claude-sonnet-4-5-20250929 \ + --provider-agent gpt-4o \ + --runs 1 --turns 10 \ + --judge-model claude-sonnet-4-5-20250929 \ + --max-personas 5 + +# Generate / judge (step by step) +uv run python generate.py -u claude-sonnet-4-5-20250929 -p gpt-4o -t 6 -r 1 +uv run python judge.py -f output/{YOUR_P_RUN}/ -j claude-sonnet-4-5-20250929 + +# Development & quality +uv sync +uv run ruff format . +uv run ruff check . 
+uv run pyright +uv run pytest -m "not live" +pre-commit run --all-files ``` ## Documentation Reference -- **Setup & Architecture**: See `README.md` -- **Pre-commit Hooks**: See `docs/pre-commit-hooks.md` -- **Custom LLM Providers**: See `docs/evaluating.md` -- **Usage Examples**: See `README.md` → "Usage" section -- **Model Configuration**: See `README.md` → "Models" section + +- **Agent guide (architecture, guardrails, testing):** [AGENTS.md](./AGENTS.md) +- **Setup & usage:** [README.md](./README.md) +- **Custom LLM providers:** [docs/evaluating.md](./docs/evaluating.md) +- **Slash commands:** [.claude/commands/README.md](./.claude/commands/README.md) ## Docker + ```bash -docker-compose up # Run via Docker +docker-compose up ``` - ---- -For detailed information, see README.md and docs/ diff --git a/docs/pre-commit-hooks.md b/docs/pre-commit-hooks.md index 1531b0ec..beb1d26e 100644 --- a/docs/pre-commit-hooks.md +++ b/docs/pre-commit-hooks.md @@ -10,40 +10,17 @@ pre-commit install # Activates hooks ## Hooks ### Standard: Ruff -Auto-formats and lints Python code. Configuration in `pyproject.toml`. - -### Custom: CLAUDE.md → AGENTS.md Sync - -**What:** Automatically keeps `AGENTS.md` identical to `CLAUDE.md`. -**Why:** Both files must contain the same project instructions - `CLAUDE.md` for Claude Code, `AGENTS.md` for custom agents. +Auto-formats and lints Python code. Configuration in `pyproject.toml`. 
-**Behavior:** -- If `AGENTS.md` doesn't exist → creates it from `CLAUDE.md` -- If `AGENTS.md` exists and matches → passes -- If `AGENTS.md` exists and differs → **fails with error** +## Agent Documentation -### Resolving Sync Conflicts +`AGENTS.md` and `CLAUDE.md` are **intentionally separate**: -If the hook fails, choose one: +- **[AGENTS.md](../AGENTS.md)** — agent-agnostic guide (architecture, testing, domain guardrails, key commands) +- **[CLAUDE.md](../CLAUDE.md)** — Claude Code slash commands and `.claude/` maintenance -```bash -# Option 1: Delete AGENTS.md (simplest - auto-recreated) -rm AGENTS.md -git add AGENTS.md -git commit - -# Option 2: Reconcile manually -diff CLAUDE.md AGENTS.md -cp CLAUDE.md AGENTS.md -git add AGENTS.md -git commit - -# Option 3: Preserve current AGENTS.md temporarily -mv AGENTS.md AGENTS.md.backup -git add AGENTS.md AGENTS.md.backup -git commit -``` +Update the file that matches your audience. There is no pre-commit sync between them. ## Manual Usage @@ -53,5 +30,4 @@ pre-commit run --all-files # Run all hooks ## Configuration -- `.pre-commit-config.yaml` - Hook configuration -- `.pre-commit-scripts/sync-claude-to-agents.sh` - Custom sync script +- `.pre-commit-config.yaml` — Hook configuration From c2d6776ca3a567fc790eccf2358a364ebc060dd9 Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Tue, 16 Jun 2026 17:13:09 -0700 Subject: [PATCH 2/4] docs: add documentation map and dedupe CLAUDE.md Centralize doc ownership in AGENTS.md and keep CLAUDE.md focused on slash commands and Claude-specific workflow. Co-authored-by: Cursor --- AGENTS.md | 19 ++++++- CLAUDE.md | 111 ++++++--------------------------------- docs/pre-commit-hooks.md | 2 +- 3 files changed, 35 insertions(+), 97 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 25864f5b..06e9cc43 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -145,10 +145,27 @@ Format: `/` (kebab-case), e.g. `feat/add-gpt4-support`, 3. 
Atomic commits; pre-commit hooks run on commit 4. Push and open a PR -## Documentation Reference +## Documentation Map + +One canonical home per concern — cross-link, don't copy paragraphs. + +| Doc | Audience | Use for | +|-----|----------|---------| +| [README.md](./README.md) | Humans | Setup, CLI usage, output layout, detailed architecture | +| **AGENTS.md** (this file) | All coding agents | Style, architecture map, testing, key commands, git conventions | +| [CLAUDE.md](./CLAUDE.md) | Claude Code only | Slash commands, `.claude/` maintenance | +| [docs/](./docs/) | Humans and agents | Topic deep dives (see links below) | + +**When to update which file:** pytest/CI policy → AGENTS.md; new CLI flag or output layout → README (+ AGENTS key commands if agents run it often); LLM provider integration → [docs/evaluating.md](./docs/evaluating.md); Claude slash commands → `.claude/commands/` + CLAUDE.md + README command list. + +**OpenSpec:** not used in this repo. Consider [OpenSpec](https://github.com/Fission-AI/OpenSpec) only for large multi-file features where you want agreed behavioral specs before coding (e.g. new judge dimensions, pipeline CLI changes). It complements — does not replace — AGENTS.md or README. + +### Links - **Setup, pipeline, output layout:** [README.md](./README.md) - **Custom LLM providers:** [docs/evaluating.md](./docs/evaluating.md) +- **Judge behavior:** [docs/judge.md](./docs/judge.md) +- **Structured output:** [docs/structured-output.md](./docs/structured-output.md) - **Pre-commit hooks:** [docs/pre-commit-hooks.md](./docs/pre-commit-hooks.md) - **Claude Code commands:** [CLAUDE.md](./CLAUDE.md), [.claude/commands/](./.claude/commands/) diff --git a/CLAUDE.md b/CLAUDE.md index 681b9c42..2b0fda05 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,71 +1,24 @@ # VERA-MH: Claude Code Guide -Framework for generating and evaluating LLM conversations in mental health contexts. This file is for **Claude Code** (slash commands, `.claude/` config). 
For agent-agnostic guidance (architecture, testing, domain guardrails), see [AGENTS.md](./AGENTS.md). +Framework for generating and evaluating LLM conversations in mental health contexts. This file is for **Claude Code** (slash commands, `.claude/` config). For agent-agnostic guidance (architecture, testing, domain guardrails, CLI commands, git conventions), see [AGENTS.md](./AGENTS.md). -## Quick Start +## Slash Commands -```bash -pip install uv -uv sync -source .venv/bin/activate # Windows: .venv\Scripts\activate -cp .env.example .env # Add API keys (ANTHROPIC_API_KEY, OPENAI_API_KEY) -``` +Prefer these over retyping CLI commands from [AGENTS.md](./AGENTS.md): -**Python >= 3.11 required** +| Area | Commands | +|------|----------| +| Setup | `/setup-dev` | +| Code quality | `/format` | +| VERA-MH | `/run-generator`, `/run-judge` | +| Testing | `/test`, `/fix-tests`, `/create-tests [module] [--layer=unit\|integration\|e2e]` | +| Git | `/create-commits`, `/create-pr` | -## Code Style +Full command docs: [.claude/commands/README.md](./.claude/commands/README.md) -- Minimal print statements -- Prioritize clarity; match existing patterns in the module you touch -- Don't create example files unless asked -- Use `python3` or `uv run python` explicitly -- Add or update tests when changing behavior +## Git Workflow (Claude Code) -## File Organization - -- **Temporary tests**: `tmp_tests/` (not committed) -- **Main scripts**: `generate.py`, `judge.py`, `run_pipeline.py` at root -- **Packages**: `generate_conversations/`, `judge/`, `llm_clients/`, `utils/` -- **Permanent tests**: `tests/` (unit and integration) -- **Docs**: `docs/`; agent architecture map in [AGENTS.md](./AGENTS.md) - -## Code Quality Tools - -- **Formatting**: `uv run ruff format .` -- **Linting**: `uv run ruff check .` -- **Type checking**: `uv run pyright` (basic mode) -- **Pre-commit**: `pre-commit install` (auto-run checks on commit) -- All configuration in `pyproject.toml` -- See 
`docs/pre-commit-hooks.md` for pre-commit documentation - -## Git Conventions - -### Commit Message Format - -Follow [Conventional Commits](https://www.conventionalcommits.org/) format: - -``` -: - -[optional body] -``` - -**Types:** `feat`, `fix`, `refactor`, `test`, `docs`, `chore`, `style`, `perf` - -**Guidelines:** Imperative mood, under 72 characters, no trailing period, atomic commits. - -### Branch Naming - -**Format:** `/` (kebab-case), e.g. `feat/add-gpt4-support` - -### Workflow - -1. **Create branch from main**: `git checkout -b type/description` -2. **Make changes**: Follow code style; run `uv run pytest -m "not live"` -3. **Commit frequently**: Atomic, logical commits -4. **Run quality checks**: Pre-commit hooks run automatically -5. **Push and create PR**: `git push -u origin branch-name` -6. **Use `/create-commits`**: Let Claude Code organize commits logically +Use `/create-commits` to organize commits logically, then `/create-pr` for the pull request. All other git conventions are in [AGENTS.md](./AGENTS.md). ## Testing @@ -93,41 +46,9 @@ See [AGENTS.md](./AGENTS.md) for full testing policy. Summary: - Update `.claude/commands/README.md` and `README.md` - Reference in `.claude/agents/test-engineer.md` if applicable -## Key Commands - -```bash -# Slash-command alternatives: /run-generator, /run-judge, /test, /format - -# End-to-end pipeline -uv run python run_pipeline.py \ - --user-agent claude-sonnet-4-5-20250929 \ - --provider-agent gpt-4o \ - --runs 1 --turns 10 \ - --judge-model claude-sonnet-4-5-20250929 \ - --max-personas 5 - -# Generate / judge (step by step) -uv run python generate.py -u claude-sonnet-4-5-20250929 -p gpt-4o -t 6 -r 1 -uv run python judge.py -f output/{YOUR_P_RUN}/ -j claude-sonnet-4-5-20250929 - -# Development & quality -uv sync -uv run ruff format . -uv run ruff check . 
-uv run pyright -uv run pytest -m "not live" -pre-commit run --all-files -``` - ## Documentation Reference -- **Agent guide (architecture, guardrails, testing):** [AGENTS.md](./AGENTS.md) -- **Setup & usage:** [README.md](./README.md) -- **Custom LLM providers:** [docs/evaluating.md](./docs/evaluating.md) -- **Slash commands:** [.claude/commands/README.md](./.claude/commands/README.md) +See the **Documentation map** in [AGENTS.md](./AGENTS.md). Claude-specific entry points: -## Docker - -```bash -docker-compose up -``` +- **Slash commands:** [.claude/commands/README.md](./.claude/commands/README.md) +- **Team settings:** [`.claude/settings.json`](./.claude/settings.json) (shared); `.claude/settings.local.json` (personal, not committed) diff --git a/docs/pre-commit-hooks.md b/docs/pre-commit-hooks.md index beb1d26e..3ef454f9 100644 --- a/docs/pre-commit-hooks.md +++ b/docs/pre-commit-hooks.md @@ -15,7 +15,7 @@ Auto-formats and lints Python code. Configuration in `pyproject.toml`. ## Agent Documentation -`AGENTS.md` and `CLAUDE.md` are **intentionally separate**: +`AGENTS.md` and `CLAUDE.md` are **intentionally separate** (see the **Documentation map** in [AGENTS.md](../AGENTS.md)): - **[AGENTS.md](../AGENTS.md)** — agent-agnostic guide (architecture, testing, domain guardrails, key commands) - **[CLAUDE.md](../CLAUDE.md)** — Claude Code slash commands and `.claude/` maintenance From 84521583550e9b72219c74eb76a995e5523bf8d5 Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Tue, 16 Jun 2026 17:14:45 -0700 Subject: [PATCH 3/4] updating gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 8cfca445..c9f57867 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,10 @@ coverage.json htmlcov/ !tests/fixtures/conversations/ +# IDEs +.cursor/ +.vscode/ + # Claude Code - personal settings and local files only .claude/settings.local.json .claude/hooks/local/ From 
7c66b89ded107e88223f7de40a935b9b2876020b Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Tue, 16 Jun 2026 17:49:12 -0700 Subject: [PATCH 4/4] docs: add /review, /code-review, and /verify slash commands Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- .claude/commands/verify.md | 11 +++++++++++ CLAUDE.md | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 .claude/commands/verify.md diff --git a/.claude/commands/verify.md b/.claude/commands/verify.md new file mode 100644 index 00000000..998863bc --- /dev/null +++ b/.claude/commands/verify.md @@ -0,0 +1,11 @@ +Verify that a code change works correctly without hitting live APIs. + +IMPORTANT: Never run `generate.py`, `judge.py`, or `run_pipeline.py` directly — these make live LLM API calls. Verification must use the test suite only. + +Steps: +1. Identify what changed (git diff --stat) to understand the scope +2. Run the relevant tests with `uv run pytest -m "not live" -v` targeting changed modules where possible +3. Run the full non-live suite to check for regressions: `uv run pytest -m "not live"` +4. Report: which tests passed/failed, coverage delta if relevant, and whether the change behaves as expected based on test output + +If the change cannot be verified without a live API call, say so explicitly and suggest what a targeted live test would look like (but do not run it).
diff --git a/CLAUDE.md b/CLAUDE.md index 2b0fda05..b31604f6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -12,7 +12,8 @@ Prefer these over retyping CLI commands from [AGENTS.md](./AGENTS.md): | Code quality | `/format` | | VERA-MH | `/run-generator`, `/run-judge` | | Testing | `/test`, `/fix-tests`, `/create-tests [module] [--layer=unit\|integration\|e2e]` | -| Git | `/create-commits`, `/create-pr` | +| Git | `/create-commits`, `/create-pr`, `/review` (local diff), `/code-review` (PR) | +| Verify | `/verify` (runs `pytest -m "not live"` — never hits live APIs) | Full command docs: [.claude/commands/README.md](./.claude/commands/README.md)