From b739daeca692066e8c6868a704e2bab1dab98352 Mon Sep 17 00:00:00 2001
From: Oleg Shulyakov <olegshulyakov@users.noreply.github.com>
Date: Thu, 21 May 2026 09:53:06 +0300
Subject: [PATCH] feat(skills): add eval coverage requirements to the
 creator-skill

- Update skill workflow step 3 to verify router skill references have 8-10 evals each
- Document `evals/evals.json` creation requirement in authoring reference
- Document `reference` field requirement in evaluation reference
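- Add `evals[].reference` field to the eval schema definition
- Expand `evals/evals.json` to 40 evals (8 per routed reference), each tagged with `reference`
- Extend `scripts/quick_validate.py` alongside the new requirements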
---
 .agents/skills/creator-skill/SKILL.md         |   2 +-
 .agents/skills/creator-skill/evals/evals.json | 606 ++++++++++++++++--
 .../creator-skill/references/authoring.md     |   2 +
 .../creator-skill/references/evaluation.md    |   2 +
 .../creator-skill/references/schemas.md       |   2 +
 .../creator-skill/scripts/quick_validate.py   |  85 +++
 6 files changed, 644 insertions(+), 55 deletions(-)

diff --git a/.agents/skills/creator-skill/SKILL.md b/.agents/skills/creator-skill/SKILL.md
index f30ed1c..b392f58 100644
--- a/.agents/skills/creator-skill/SKILL.md
+++ b/.agents/skills/creator-skill/SKILL.md
@@ -35,7 +35,7 @@ If the request spans multiple phases, read the references in workflow order: aut
 
 1. **Clarify scope**: identify what the skill should do, which user phrases or contexts should trigger it, what output it should produce, and whether objective evals are useful.
 2. **Write the skill**: revise `SKILL.md` with concise metadata, focused instructions, bold scan anchors, and references for details that would bloat the main file.
-3. **Test behavior**: run this skill's `scripts/quick_validate.py` against the target skill; for objectively testable skills, create realistic eval prompts and run skill-enabled outputs against a meaningful baseline.
+3. **Test behavior**: run this skill's `scripts/quick_validate.py` against the target skill; for router skills, confirm every routed (non-schema) `references/*.md` file has 8-10 evals in `evals/evals.json`, each mapped to its route by the `reference` field; for objectively testable skills, run skill-enabled outputs against a meaningful baseline.
 4. **Show evidence**: share outputs and benchmark results with the user before making another revision.
 5. **Iterate deliberately**: continue until feedback is resolved or further changes stop improving results.
 6. **Package last**: package the final skill only after the user is satisfied with behavior and trigger accuracy.
diff --git a/.agents/skills/creator-skill/evals/evals.json b/.agents/skills/creator-skill/evals/evals.json
index ae20add..83581bc 100644
--- a/.agents/skills/creator-skill/evals/evals.json
+++ b/.agents/skills/creator-skill/evals/evals.json
@@ -3,104 +3,602 @@
   "evals": [
     {
       "id": 1,
-      "prompt": "Create a new skill for writing incident postmortems. It should trigger on postmortem, incident review, RCA, and retrospective requests. Include a concise SKILL.md and any references you think are needed.",
-      "expected_output": "A new skill folder design with concise metadata, a focused SKILL.md body, and references only where they reduce main-file size.",
+      "reference": "references/authoring.md",
+      "prompt": "Create a new skill for incident postmortems with concise SKILL.md and only useful references.",
+      "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.",
       "files": [],
       "expectations": [
-        "Metadata/frontmatter is kept under 100 tokens",
-        "Instruction body is kept under 5000 tokens",
-        "Trigger cues are in the description, not scattered only in the body",
-        "References are added only for meaningful detail",
-        "No placeholder directories are created"
+        "Routes to authoring guidance",
+        "Produces or revises SKILL.md instructions",
+        "Keeps metadata and body budgets in mind",
+        "Uses references only when they reduce main-file complexity",
+        "Avoids placeholder bundled resources",
+        "Preserves portability and safety"
       ]
     },
     {
       "id": 2,
-      "prompt": "This existing skill has a 7000-token SKILL.md with sections for AWS, GCP, Azure, Kubernetes, and Terraform. Refactor it without changing what it can do.",
-      "expected_output": "A router-style refactor plan or patch that keeps the main SKILL.md concise and moves provider-specific guidance into references.",
+      "reference": "references/authoring.md",
+      "prompt": "Refactor a 7000-token cloud skill into a router without changing capabilities.",
+      "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.",
       "files": [],
       "expectations": [
-        "Identifies the main body length violation",
-        "Keeps the skill as a router rather than flattening all guidance",
-        "Moves provider-specific details into separate references",
-        "Preserves existing capability and trigger intent",
-        "Main SKILL.md remains under the instruction budget"
+        "Routes to authoring guidance",
+        "Produces or revises SKILL.md instructions",
+        "Keeps metadata and body budgets in mind",
+        "Uses references only when they reduce main-file complexity",
+        "Avoids placeholder bundled resources",
+        "Preserves portability and safety"
       ]
     },
     {
       "id": 3,
-      "prompt": "Generate eval cases for a router skill that writes docs: README, API docs, changelog, release notes, and runbooks. Include confusing near-misses.",
-      "expected_output": "An eval set with route coverage, realistic prompts, expected outputs, and near-miss cases that test routing boundaries.",
+      "reference": "references/authoring.md",
+      "prompt": "Create a skill for CSV cleanup with trigger description, workflow, references, and evals.",
+      "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.",
       "files": [],
       "expectations": [
-        "Includes multiple eval prompts per route or explains coverage tradeoff",
-        "Includes near-miss prompts that could route incorrectly",
-        "Each eval has expected_output and files fields",
-        "Expectations are objectively gradeable",
-        "Eval file belongs inside the skill folder"
+        "Routes to authoring guidance",
+        "Produces or revises SKILL.md instructions",
+        "Keeps metadata and body budgets in mind",
+        "Uses references only when they reduce main-file complexity",
+        "Avoids placeholder bundled resources",
+        "Preserves portability and safety"
       ]
     },
     {
       "id": 4,
-      "prompt": "Optimize this skill description for trigger accuracy. The skill generates database schemas and SQL queries. Make sure the resulting metadata stays within the new budget.",
-      "expected_output": "A concise optimized description and a trigger-eval approach that respects the metadata length budget.",
+      "reference": "references/authoring.md",
+      "prompt": "Revise a skill so its frontmatter stays under 100 tokens and the body under 500 lines.",
+      "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.",
       "files": [],
       "expectations": [
-        "Description stays under 100 tokens",
-        "Description includes core trigger intent",
-        "Does not include a long keyword inventory",
-        "Suggests should-trigger and should-not-trigger evals",
-        "Avoids model/runtime overrides unless requested"
+        "Routes to authoring guidance",
+        "Produces or revises SKILL.md instructions",
+        "Keeps metadata and body budgets in mind",
+        "Uses references only when they reduce main-file complexity",
+        "Avoids placeholder bundled resources",
+        "Preserves portability and safety"
       ]
     },
     {
       "id": 5,
-      "prompt": "Package this completed skill and explain what should be validated first. The skill has scripts, references, and evals.",
-      "expected_output": "A packaging workflow that validates the skill, preserves canonical SKILL.md structure, and uses the packaging script when available.",
+      "reference": "references/authoring.md",
+      "prompt": "Turn a long provider-specific skill into progressive-disclosure references.",
+      "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.",
       "files": [],
       "expectations": [
-        "Runs or recommends quick validation before packaging",
-        "Uses scripts/package_skill.py when available",
-        "Preserves the original skill name",
-        "Does not package stale iteration output unnecessarily",
-        "Reports the resulting artifact path or blocker"
+        "Routes to authoring guidance",
+        "Produces or revises SKILL.md instructions",
+        "Keeps metadata and body budgets in mind",
+        "Uses references only when they reduce main-file complexity",
+        "Avoids placeholder bundled resources",
+        "Preserves portability and safety"
       ]
     },
     {
       "id": 6,
-      "prompt": "Review the new agent skill I created in .agents/skills/data-cleanup before I publish it.",
-      "expected_output": "A skill-focused review that inspects the skill folder and reports actionable findings about trigger precision, scope, instructions, progressive disclosure, validation, and eval coverage.",
+      "reference": "references/authoring.md",
+      "prompt": "Create a new skill folder and avoid placeholder scripts/assets/evals.",
+      "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.",
       "files": [],
       "expectations": [
-        "Treats the request as a review, not as a request to rewrite the skill",
-        "Inspects the full skill folder, including SKILL.md, references, scripts, assets, and evals when present",
-        "Loads or follows references/review.md",
-        "Leads with actionable findings grounded in specific skill files and line references"
+        "Routes to authoring guidance",
+        "Produces or revises SKILL.md instructions",
+        "Keeps metadata and body budgets in mind",
+        "Uses references only when they reduce main-file complexity",
+        "Avoids placeholder bundled resources",
+        "Preserves portability and safety"
       ]
     },
     {
       "id": 7,
-      "prompt": "Review this skill description: `description: Process files.` It should trigger for CSV cleanup, normalization, and deduplication tasks but not for generic file conversion.",
-      "expected_output": "A trigger-description review that identifies overbroad or underspecified triggering risk and suggests a precise fix direction.",
+      "reference": "references/authoring.md",
+      "prompt": "Update a skill to add section delimiters and bold principle sentences.",
+      "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.",
       "files": [],
       "expectations": [
-        "Recognizes the target as an agent skill trigger-description review",
-        "Loads or follows references/review.md for trigger-description criteria",
-        "Checks should-trigger and should-not-trigger boundaries instead of keyword stuffing",
-        "Explains the likely false-negative or false-positive behavior and a concise fix direction"
+        "Routes to authoring guidance",
+        "Produces or revises SKILL.md instructions",
+        "Keeps metadata and body budgets in mind",
+        "Uses references only when they reduce main-file complexity",
+        "Avoids placeholder bundled resources",
+        "Preserves portability and safety"
       ]
     },
     {
       "id": 8,
-      "prompt": "Update an existing skill to match our preferred Markdown style: every `##` section should open with a bold principle sentence, and rule lists should use bold labels as scan anchors when each item is a distinct rule.",
-      "expected_output": "A focused style update that applies bold principle sentences and bold-labeled rule bullets without changing the skill's behavior, schemas, command examples, or helper-agent contracts unnecessarily.",
+      "reference": "references/authoring.md",
+      "prompt": "Create a portable skill that avoids runtime-specific slash commands.",
+      "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.",
       "files": [],
       "expectations": [
-        "Every edited `##` section opens with a single bold principle sentence",
-        "Distinct rule bullets use bold labels as scan anchors",
-        "Schemas, command examples, and literal output templates are not restyled unnecessarily",
-        "The update preserves the skill's behavior and routing semantics",
-        "The result is checked with a deterministic Markdown validator or diff review"
+        "Routes to authoring guidance",
+        "Produces or revises SKILL.md instructions",
+        "Keeps metadata and body budgets in mind",
+        "Uses references only when they reduce main-file complexity",
+        "Avoids placeholder bundled resources",
+        "Preserves portability and safety"
+      ]
+    },
+    {
+      "id": 9,
+      "reference": "references/review.md",
+      "prompt": "Review the new agent skill in .agents/skills/data-cleanup before publishing.",
+      "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to review guidance",
+        "Inspects the skill folder before judging",
+        "Leads with actionable findings",
+        "Checks trigger precision, scope, references, and eval coverage",
+        "Grounds findings in specific files or lines",
+        "Avoids rewriting unless requested"
+      ]
+    },
+    {
+      "id": 10,
+      "reference": "references/review.md",
+      "prompt": "Review this skill description: `description: Process files.`",
+      "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to review guidance",
+        "Inspects the skill folder before judging",
+        "Leads with actionable findings",
+        "Checks trigger precision, scope, references, and eval coverage",
+        "Grounds findings in specific files or lines",
+        "Avoids rewriting unless requested"
+      ]
+    },
+    {
+      "id": 11,
+      "reference": "references/review.md",
+      "prompt": "Review a skill for trigger overlap against writer-tech-docs and writer-spec.",
+      "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to review guidance",
+        "Inspects the skill folder before judging",
+        "Leads with actionable findings",
+        "Checks trigger precision, scope, references, and eval coverage",
+        "Grounds findings in specific files or lines",
+        "Avoids rewriting unless requested"
+      ]
+    },
+    {
+      "id": 12,
+      "reference": "references/review.md",
+      "prompt": "Find quality issues in a skill that has stale references and no evals.",
+      "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to review guidance",
+        "Inspects the skill folder before judging",
+        "Leads with actionable findings",
+        "Checks trigger precision, scope, references, and eval coverage",
+        "Grounds findings in specific files or lines",
+        "Avoids rewriting unless requested"
+      ]
+    },
+    {
+      "id": 13,
+      "reference": "references/review.md",
+      "prompt": "Review whether a skill is too broad and should be split into multiple skills.",
+      "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to review guidance",
+        "Inspects the skill folder before judging",
+        "Leads with actionable findings",
+        "Checks trigger precision, scope, references, and eval coverage",
+        "Grounds findings in specific files or lines",
+        "Avoids rewriting unless requested"
+      ]
+    },
+    {
+      "id": 14,
+      "reference": "references/review.md",
+      "prompt": "Review a skill package for hidden credential capture or exfiltration risk.",
+      "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to review guidance",
+        "Inspects the skill folder before judging",
+        "Leads with actionable findings",
+        "Checks trigger precision, scope, references, and eval coverage",
+        "Grounds findings in specific files or lines",
+        "Avoids rewriting unless requested"
+      ]
+    },
+    {
+      "id": 15,
+      "reference": "references/review.md",
+      "prompt": "Review a router skill for missing route instructions and weak output format.",
+      "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to review guidance",
+        "Inspects the skill folder before judging",
+        "Leads with actionable findings",
+        "Checks trigger precision, scope, references, and eval coverage",
+        "Grounds findings in specific files or lines",
+        "Avoids rewriting unless requested"
+      ]
+    },
+    {
+      "id": 16,
+      "reference": "references/review.md",
+      "prompt": "Review a skill after a style-only patch and identify behavioral drift.",
+      "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to review guidance",
+        "Inspects the skill folder before judging",
+        "Leads with actionable findings",
+        "Checks trigger precision, scope, references, and eval coverage",
+        "Grounds findings in specific files or lines",
+        "Avoids rewriting unless requested"
+      ]
+    },
+    {
+      "id": 17,
+      "reference": "references/evaluation.md",
+      "prompt": "Generate eval cases for a router skill that writes README, API docs, changelog, release notes, and runbooks.",
+      "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to evaluation guidance",
+        "Creates realistic prompt-level eval cases",
+        "Includes route or trigger boundary coverage",
+        "Uses objective expectations where possible",
+        "Keeps evals inside the skill folder",
+        "Mentions reproducible iteration or benchmark workflow when relevant"
+      ]
+    },
+    {
+      "id": 18,
+      "reference": "references/evaluation.md",
+      "prompt": "Create evals for a focused skill that writes incident postmortems.",
+      "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to evaluation guidance",
+        "Creates realistic prompt-level eval cases",
+        "Includes route or trigger boundary coverage",
+        "Uses objective expectations where possible",
+        "Keeps evals inside the skill folder",
+        "Mentions reproducible iteration or benchmark workflow when relevant"
+      ]
+    },
+    {
+      "id": 19,
+      "reference": "references/evaluation.md",
+      "prompt": "Build near-miss trigger evals for a database skill versus report-db-health.",
+      "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to evaluation guidance",
+        "Creates realistic prompt-level eval cases",
+        "Includes route or trigger boundary coverage",
+        "Uses objective expectations where possible",
+        "Keeps evals inside the skill folder",
+        "Mentions reproducible iteration or benchmark workflow when relevant"
+      ]
+    },
+    {
+      "id": 20,
+      "reference": "references/evaluation.md",
+      "prompt": "Plan an iteration comparing with_skill and without_skill outputs.",
+      "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to evaluation guidance",
+        "Creates realistic prompt-level eval cases",
+        "Includes route or trigger boundary coverage",
+        "Uses objective expectations where possible",
+        "Keeps evals inside the skill folder",
+        "Mentions reproducible iteration or benchmark workflow when relevant"
+      ]
+    },
+    {
+      "id": 21,
+      "reference": "references/evaluation.md",
+      "prompt": "Define objective grading assertions for generated SQL migrations.",
+      "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to evaluation guidance",
+        "Creates realistic prompt-level eval cases",
+        "Includes route or trigger boundary coverage",
+        "Uses objective expectations where possible",
+        "Keeps evals inside the skill folder",
+        "Mentions reproducible iteration or benchmark workflow when relevant"
+      ]
+    },
+    {
+      "id": 22,
+      "reference": "references/evaluation.md",
+      "prompt": "Aggregate benchmark results from iteration-2 and summarize pass rates.",
+      "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to evaluation guidance",
+        "Creates realistic prompt-level eval cases",
+        "Includes route or trigger boundary coverage",
+        "Uses objective expectations where possible",
+        "Keeps evals inside the skill folder",
+        "Mentions reproducible iteration or benchmark workflow when relevant"
+      ]
+    },
+    {
+      "id": 23,
+      "reference": "references/evaluation.md",
+      "prompt": "Create a human review UI for a skill eval run in a headless environment.",
+      "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to evaluation guidance",
+        "Creates realistic prompt-level eval cases",
+        "Includes route or trigger boundary coverage",
+        "Uses objective expectations where possible",
+        "Keeps evals inside the skill folder",
+        "Mentions reproducible iteration or benchmark workflow when relevant"
+      ]
+    },
+    {
+      "id": 24,
+      "reference": "references/evaluation.md",
+      "prompt": "Revise eval cases after repeated failures in route selection.",
+      "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to evaluation guidance",
+        "Creates realistic prompt-level eval cases",
+        "Includes route or trigger boundary coverage",
+        "Uses objective expectations where possible",
+        "Keeps evals inside the skill folder",
+        "Mentions reproducible iteration or benchmark workflow when relevant"
+      ]
+    },
+    {
+      "id": 25,
+      "reference": "references/description-optimization.md",
+      "prompt": "Optimize a skill description for database schemas and SQL queries within budget.",
+      "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to description optimization guidance",
+        "Improves trigger precision",
+        "Balances should-trigger and should-not-trigger cases",
+        "Keeps metadata concise",
+        "Avoids keyword stuffing",
+        "Explains false-positive or false-negative risk"
+      ]
+    },
+    {
+      "id": 26,
+      "reference": "references/description-optimization.md",
+      "prompt": "Improve a trigger description that undertriggers for runbook requests.",
+      "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to description optimization guidance",
+        "Improves trigger precision",
+        "Balances should-trigger and should-not-trigger cases",
+        "Keeps metadata concise",
+        "Avoids keyword stuffing",
+        "Explains false-positive or false-negative risk"
+      ]
+    },
+    {
+      "id": 27,
+      "reference": "references/description-optimization.md",
+      "prompt": "Make a description less broad so it does not trigger on generic file conversion.",
+      "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to description optimization guidance",
+        "Improves trigger precision",
+        "Balances should-trigger and should-not-trigger cases",
+        "Keeps metadata concise",
+        "Avoids keyword stuffing",
+        "Explains false-positive or false-negative risk"
+      ]
+    },
+    {
+      "id": 28,
+      "reference": "references/description-optimization.md",
+      "prompt": "Create should-trigger and should-not-trigger examples for a skill description.",
+      "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to description optimization guidance",
+        "Improves trigger precision",
+        "Balances should-trigger and should-not-trigger cases",
+        "Keeps metadata concise",
+        "Avoids keyword stuffing",
+        "Explains false-positive or false-negative risk"
+      ]
+    },
+    {
+      "id": 29,
+      "reference": "references/description-optimization.md",
+      "prompt": "Shorten an overlong skill description without losing core trigger cues.",
+      "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to description optimization guidance",
+        "Improves trigger precision",
+        "Balances should-trigger and should-not-trigger cases",
+        "Keeps metadata concise",
+        "Avoids keyword stuffing",
+        "Explains false-positive or false-negative risk"
+      ]
+    },
+    {
+      "id": 30,
+      "reference": "references/description-optimization.md",
+      "prompt": "Diagnose trigger overlap between writer-prd and writer-spec descriptions.",
+      "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to description optimization guidance",
+        "Improves trigger precision",
+        "Balances should-trigger and should-not-trigger cases",
+        "Keeps metadata concise",
+        "Avoids keyword stuffing",
+        "Explains false-positive or false-negative risk"
+      ]
+    },
+    {
+      "id": 31,
+      "reference": "references/description-optimization.md",
+      "prompt": "Improve a code review skill description for PR, diff, branch, and patch requests.",
+      "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to description optimization guidance",
+        "Improves trigger precision",
+        "Balances should-trigger and should-not-trigger cases",
+        "Keeps metadata concise",
+        "Avoids keyword stuffing",
+        "Explains false-positive or false-negative risk"
+      ]
+    },
+    {
+      "id": 32,
+      "reference": "references/description-optimization.md",
+      "prompt": "Optimize metadata for a router skill without stuffing every route keyword.",
+      "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to description optimization guidance",
+        "Improves trigger precision",
+        "Balances should-trigger and should-not-trigger cases",
+        "Keeps metadata concise",
+        "Avoids keyword stuffing",
+        "Explains false-positive or false-negative risk"
+      ]
+    },
+    {
+      "id": 33,
+      "reference": "references/agent-compatibility.md",
+      "prompt": "Adapt a skill workflow for a generic CLI agent without subagents.",
+      "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to compatibility guidance",
+        "Removes or isolates runtime-specific assumptions",
+        "Keeps core workflow portable",
+        "Explains fallback behavior for unavailable features",
+        "Avoids relying on one agent UI or command set",
+        "Preserves skill intent"
+      ]
+    },
+    {
+      "id": 34,
+      "reference": "references/agent-compatibility.md",
+      "prompt": "Port a skill that mentions Claude Code commands to runtime-neutral instructions.",
+      "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to compatibility guidance",
+        "Removes or isolates runtime-specific assumptions",
+        "Keeps core workflow portable",
+        "Explains fallback behavior for unavailable features",
+        "Avoids relying on one agent UI or command set",
+        "Preserves skill intent"
+      ]
+    },
+    {
+      "id": 35,
+      "reference": "references/agent-compatibility.md",
+      "prompt": "Document compatibility notes for a skill that uses local scripts.",
+      "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to compatibility guidance",
+        "Removes or isolates runtime-specific assumptions",
+        "Keeps core workflow portable",
+        "Explains fallback behavior for unavailable features",
+        "Avoids relying on one agent UI or command set",
+        "Preserves skill intent"
+      ]
+    },
+    {
+      "id": 36,
+      "reference": "references/agent-compatibility.md",
+      "prompt": "Adjust a skill so Browser plugin instructions are isolated from core workflow.",
+      "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to compatibility guidance",
+        "Removes or isolates runtime-specific assumptions",
+        "Keeps core workflow portable",
+        "Explains fallback behavior for unavailable features",
+        "Avoids relying on one agent UI or command set",
+        "Preserves skill intent"
+      ]
+    },
+    {
+      "id": 37,
+      "reference": "references/agent-compatibility.md",
+      "prompt": "Make a skill usable in agents that cannot spawn helper agents.",
+      "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to compatibility guidance",
+        "Removes or isolates runtime-specific assumptions",
+        "Keeps core workflow portable",
+        "Explains fallback behavior for unavailable features",
+        "Avoids relying on one agent UI or command set",
+        "Preserves skill intent"
+      ]
+    },
+    {
+      "id": 38,
+      "reference": "references/agent-compatibility.md",
+      "prompt": "Replace runtime-specific tool names with portable action descriptions.",
+      "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to compatibility guidance",
+        "Removes or isolates runtime-specific assumptions",
+        "Keeps core workflow portable",
+        "Explains fallback behavior for unavailable features",
+        "Avoids relying on one agent UI or command set",
+        "Preserves skill intent"
+      ]
+    },
+    {
+      "id": 39,
+      "reference": "references/agent-compatibility.md",
+      "prompt": "Review a skill for assumptions about event streams and UI affordances.",
+      "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to compatibility guidance",
+        "Removes or isolates runtime-specific assumptions",
+        "Keeps core workflow portable",
+        "Explains fallback behavior for unavailable features",
+        "Avoids relying on one agent UI or command set",
+        "Preserves skill intent"
+      ]
+    },
+    {
+      "id": 40,
+      "reference": "references/agent-compatibility.md",
+      "prompt": "Write compatibility guidance for packaging and validation in another CLI runtime.",
+      "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.",
+      "files": [],
+      "expectations": [
+        "Routes to compatibility guidance",
+        "Removes or isolates runtime-specific assumptions",
+        "Keeps core workflow portable",
+        "Explains fallback behavior for unavailable features",
+        "Avoids relying on one agent UI or command set",
+        "Preserves skill intent"
       ]
     }
   ]
diff --git a/.agents/skills/creator-skill/references/authoring.md b/.agents/skills/creator-skill/references/authoring.md
index 41706ac..e91cbc6 100644
--- a/.agents/skills/creator-skill/references/authoring.md
+++ b/.agents/skills/creator-skill/references/authoring.md
@@ -35,6 +35,8 @@ Apply the house Markdown style while writing, not as a later cleanup pass:
 
 After editing, run `creator-skill/scripts/quick_validate.py <target-skill-directory>` when this skill's scripts are available. Treat style failures as authoring bugs, not optional polish.
 
+For router skills with `references/*.md`, create `evals/evals.json` before validation is considered complete. Each eval must include a `reference` field that points to the routed reference, and every reference file except `references/schemas.md` must have 8-10 evals. This keeps the router honest instead of giving it one polite smoke test and hoping for the best.
+
 ## Length Budgets
 
 **Respect the runtime context budget before adding detail.**
diff --git a/.agents/skills/creator-skill/references/evaluation.md b/.agents/skills/creator-skill/references/evaluation.md
index ae11663..16cb86b 100644
--- a/.agents/skills/creator-skill/references/evaluation.md
+++ b/.agents/skills/creator-skill/references/evaluation.md
@@ -10,6 +10,8 @@ Create 8-10 realistic prompts for a focused skill. For router skills, create 8-1
 
 Save test cases to `<skill-path>/evals/evals.json`. Keep evals inside the skill folder so prompts, fixtures, outputs, and benchmark history travel with the skill.
 
+For router skills, add a `reference` field to every eval using the exact relative path, such as `references/postgres.md`. Every reference file that the router can load must have 8-10 evals; `references/schemas.md` is exempt from this count. Near-miss prompts still count toward the route they are intended to test.
+
 Start with prompt-level expectations. Add objective assertions after the test set is agreed or while runs are in progress.
 
 ## Run Iterations
diff --git a/.agents/skills/creator-skill/references/schemas.md b/.agents/skills/creator-skill/references/schemas.md
index 04c5948..8c252c2 100644
--- a/.agents/skills/creator-skill/references/schemas.md
+++ b/.agents/skills/creator-skill/references/schemas.md
@@ -14,6 +14,7 @@ Defines the evals for a skill. Located at `<skill-path>/evals/evals.json` within
   "evals": [
     {
       "id": 1,
+      "reference": "references/example-route.md",
       "prompt": "User's example prompt",
       "expected_output": "Description of expected result",
       "files": ["evals/files/sample1.pdf"],
@@ -27,6 +28,7 @@ Defines the evals for a skill. Located at `<skill-path>/evals/evals.json` within
 
 - `skill_name`: Name matching the skill's frontmatter
 - `evals[].id`: Unique integer identifier
+- `evals[].reference`: Required for router skills; exact relative path to the routed reference, such as `references/postgres.md`
 - `evals[].prompt`: The task to execute
 - `evals[].expected_output`: Human-readable description of success
 - `evals[].files`: Optional list of input file paths (relative to skill root)
diff --git a/.agents/skills/creator-skill/scripts/quick_validate.py b/.agents/skills/creator-skill/scripts/quick_validate.py
index 7cfbbc5..f68a3c9 100755
--- a/.agents/skills/creator-skill/scripts/quick_validate.py
+++ b/.agents/skills/creator-skill/scripts/quick_validate.py
@@ -6,6 +6,7 @@
 import sys
 import os
 import re
+import json
 import yaml
 from pathlib import Path
 
@@ -58,6 +59,86 @@ def validate_markdown_style(skill_path):
     return True, None
 
 
+def validate_eval_coverage(skill_path):
+    """Check that evals/evals.json covers every routed reference file.
+
+    Args:
+        skill_path: pathlib.Path of the skill directory to inspect.
+
+    Returns:
+        (ok, message) tuple; message is None on success, otherwise it
+        describes the first failed check. Each references/*.md file except
+        schemas.md must be named by the 'reference' field of 8-10 evals.
+    """
+    evals_path = skill_path / 'evals' / 'evals.json'
+    references_dir = skill_path / 'references'
+
+    if not evals_path.exists():
+        if references_dir.exists() and any(references_dir.glob('*.md')):
+            return False, "Router skills with references must include evals/evals.json"
+        return True, None
+
+    try:
+        # Explicit UTF-8 so decoding does not depend on the platform locale.
+        data = json.loads(evals_path.read_text(encoding='utf-8'))
+    except (UnicodeDecodeError, json.JSONDecodeError) as e:
+        return False, f"Invalid JSON in {evals_path}: {e}"
+
+    # A top-level array or scalar has no .get(); report it as a missing field.
+    evals = data.get('evals') if isinstance(data, dict) else None
+    if not isinstance(evals, list):
+        return False, f"{evals_path}: missing list field 'evals'"
+    if len(evals) < 2:
+        return False, f"{evals_path}: expected at least 2 evals, found {len(evals)}"
+
+    # schemas.md is exempt; a missing references/ dir globs to nothing, so a
+    # skill without routed references passes once the basic checks succeed.
+    counts = {
+        f"references/{path.name}": 0
+        for path in sorted(references_dir.glob('*.md'))
+        if path.name != 'schemas.md'
+    }
+    if not counts:
+        return True, None
+
+    missing_reference, unknown_references = [], {}
+    for index, item in enumerate(evals, start=1):
+        if not isinstance(item, dict):
+            return False, f"{evals_path}: eval {index} must be an object"
+        eval_id = str(item.get('id', index))
+        reference = item.get('reference')
+        if not reference:
+            missing_reference.append(eval_id)
+        elif not isinstance(reference, str) or reference not in counts:
+            # Type check first: an unhashable value would raise on lookup.
+            unknown_references[eval_id] = reference
+        else:
+            counts[reference] += 1
+
+    if missing_reference:
+        return False, (
+            f"{evals_path}: router evals must include a 'reference' field; "
+            f"missing on eval id(s): {', '.join(missing_reference[:10])}"
+        )
+
+    if unknown_references:
+        examples = ', '.join(
+            f"{eval_id} -> {reference}"
+            for eval_id, reference in list(unknown_references.items())[:5]
+        )
+        return False, f"{evals_path}: evals reference unknown files: {examples}"
+
+    bad_counts = {ref: n for ref, n in counts.items() if not 8 <= n <= 10}
+    if bad_counts:
+        summary = ', '.join(f"{ref}={n}" for ref, n in bad_counts.items())
+        return False, (
+            f"{evals_path}: router references must each have 8-10 evals; "
+            f"found {summary}"
+        )
+
+    return True, None
+
+
 def validate_skill(skill_path):
     """Basic validation of a skill"""
     skill_path = Path(skill_path)
@@ -153,6 +234,10 @@ def validate_skill(skill_path):
     if not valid_style:
         return False, style_message
 
+    valid_evals, eval_message = validate_eval_coverage(skill_path)
+    if not valid_evals:
+        return False, eval_message
+
     return True, "Skill is valid!"
 
 if __name__ == "__main__":