From b739daeca692066e8c6868a704e2bab1dab98352 Mon Sep 17 00:00:00 2001 From: Oleg Shulyakov Date: Thu, 21 May 2026 09:53:06 +0300 Subject: [PATCH] feat(skills): add eval coverage requirements to the creator-skill - Update skill workflow step 3 to verify router skill references have 8-10 evals each - Document `evals/evals.json` creation requirement in authoring reference - Document `reference` field requirement in evaluation reference - Add `evals[].reference` field to the eval schema definition --- .agents/skills/creator-skill/SKILL.md | 2 +- .agents/skills/creator-skill/evals/evals.json | 606 ++++++++++++++++-- .../creator-skill/references/authoring.md | 2 + .../creator-skill/references/evaluation.md | 2 + .../creator-skill/references/schemas.md | 2 + .../creator-skill/scripts/quick_validate.py | 85 +++ 6 files changed, 644 insertions(+), 55 deletions(-) diff --git a/.agents/skills/creator-skill/SKILL.md b/.agents/skills/creator-skill/SKILL.md index f30ed1c..b392f58 100644 --- a/.agents/skills/creator-skill/SKILL.md +++ b/.agents/skills/creator-skill/SKILL.md @@ -35,7 +35,7 @@ If the request spans multiple phases, read the references in workflow order: aut 1. **Clarify scope**: identify what the skill should do, which user phrases or contexts should trigger it, what output it should produce, and whether objective evals are useful. 2. **Write the skill**: revise `SKILL.md` with concise metadata, focused instructions, bold scan anchors, and references for details that would bloat the main file. -3. **Test behavior**: run this skill's `scripts/quick_validate.py` against the target skill; for objectively testable skills, create realistic eval prompts and run skill-enabled outputs against a meaningful baseline. +3. 
**Test behavior**: run this skill's `scripts/quick_validate.py` against the target skill; for router skills, confirm every `references/*.md` file except `references/schemas.md` has 8-10 evals mapped to it by the `reference` field; for objectively testable skills, run skill-enabled outputs against a meaningful baseline. 4. **Show evidence**: share outputs and benchmark results with the user before making another revision. 5. **Iterate deliberately**: continue until feedback is resolved or further changes stop improving results. 6. **Package last**: package the final skill only after the user is satisfied with behavior and trigger accuracy. diff --git a/.agents/skills/creator-skill/evals/evals.json b/.agents/skills/creator-skill/evals/evals.json index ae20add..83581bc 100644 --- a/.agents/skills/creator-skill/evals/evals.json +++ b/.agents/skills/creator-skill/evals/evals.json @@ -3,104 +3,602 @@ "evals": [ { "id": 1, - "prompt": "Create a new skill for writing incident postmortems. It should trigger on postmortem, incident review, RCA, and retrospective requests. 
Include a concise SKILL.md and any references you think are needed.", - "expected_output": "A new skill folder design with concise metadata, a focused SKILL.md body, and references only where they reduce main-file size.", + "reference": "references/authoring.md", + "prompt": "Create a new skill for incident postmortems with concise SKILL.md and only useful references.", + "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.", "files": [], "expectations": [ - "Metadata/frontmatter is kept under 100 tokens", - "Instruction body is kept under 5000 tokens", - "Trigger cues are in the description, not scattered only in the body", - "References are added only for meaningful detail", - "No placeholder directories are created" + "Routes to authoring guidance", + "Produces or revises SKILL.md instructions", + "Keeps metadata and body budgets in mind", + "Uses references only when they reduce main-file complexity", + "Avoids placeholder bundled resources", + "Preserves portability and safety" ] }, { "id": 2, - "prompt": "This existing skill has a 7000-token SKILL.md with sections for AWS, GCP, Azure, Kubernetes, and Terraform. 
Refactor it without changing what it can do.", - "expected_output": "A router-style refactor plan or patch that keeps the main SKILL.md concise and moves provider-specific guidance into references.", + "reference": "references/authoring.md", + "prompt": "Refactor a 7000-token cloud skill into a router without changing capabilities.", + "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.", "files": [], "expectations": [ - "Identifies the main body length violation", - "Keeps the skill as a router rather than flattening all guidance", - "Moves provider-specific details into separate references", - "Preserves existing capability and trigger intent", - "Main SKILL.md remains under the instruction budget" + "Routes to authoring guidance", + "Produces or revises SKILL.md instructions", + "Keeps metadata and body budgets in mind", + "Uses references only when they reduce main-file complexity", + "Avoids placeholder bundled resources", + "Preserves portability and safety" ] }, { "id": 3, - "prompt": "Generate eval cases for a router skill that writes docs: README, API docs, changelog, release notes, and runbooks. 
Include confusing near-misses.", - "expected_output": "An eval set with route coverage, realistic prompts, expected outputs, and near-miss cases that test routing boundaries.", + "reference": "references/authoring.md", + "prompt": "Create a skill for CSV cleanup with trigger description, workflow, references, and evals.", + "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.", "files": [], "expectations": [ - "Includes multiple eval prompts per route or explains coverage tradeoff", - "Includes near-miss prompts that could route incorrectly", - "Each eval has expected_output and files fields", - "Expectations are objectively gradeable", - "Eval file belongs inside the skill folder" + "Routes to authoring guidance", + "Produces or revises SKILL.md instructions", + "Keeps metadata and body budgets in mind", + "Uses references only when they reduce main-file complexity", + "Avoids placeholder bundled resources", + "Preserves portability and safety" ] }, { "id": 4, - "prompt": "Optimize this skill description for trigger accuracy. The skill generates database schemas and SQL queries. 
Make sure the resulting metadata stays within the new budget.", - "expected_output": "A concise optimized description and a trigger-eval approach that respects the metadata length budget.", + "reference": "references/authoring.md", + "prompt": "Revise a skill so its frontmatter stays under 100 tokens and the body under 500 lines.", + "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.", "files": [], "expectations": [ - "Description stays under 100 tokens", - "Description includes core trigger intent", - "Does not include a long keyword inventory", - "Suggests should-trigger and should-not-trigger evals", - "Avoids model/runtime overrides unless requested" + "Routes to authoring guidance", + "Produces or revises SKILL.md instructions", + "Keeps metadata and body budgets in mind", + "Uses references only when they reduce main-file complexity", + "Avoids placeholder bundled resources", + "Preserves portability and safety" ] }, { "id": 5, - "prompt": "Package this completed skill and explain what should be validated first. 
The skill has scripts, references, and evals.", - "expected_output": "A packaging workflow that validates the skill, preserves canonical SKILL.md structure, and uses the packaging script when available.", + "reference": "references/authoring.md", + "prompt": "Turn a long provider-specific skill into progressive-disclosure references.", + "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.", "files": [], "expectations": [ - "Runs or recommends quick validation before packaging", - "Uses scripts/package_skill.py when available", - "Preserves the original skill name", - "Does not package stale iteration output unnecessarily", - "Reports the resulting artifact path or blocker" + "Routes to authoring guidance", + "Produces or revises SKILL.md instructions", + "Keeps metadata and body budgets in mind", + "Uses references only when they reduce main-file complexity", + "Avoids placeholder bundled resources", + "Preserves portability and safety" ] }, { "id": 6, - "prompt": "Review the new agent skill I created in .agents/skills/data-cleanup before I publish it.", - "expected_output": "A skill-focused review that inspects the skill folder and reports actionable findings about trigger precision, scope, instructions, progressive disclosure, validation, and eval coverage.", + "reference": "references/authoring.md", + "prompt": "Create a new skill folder and avoid placeholder scripts/assets/evals.", + "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.", "files": [], "expectations": [ - "Treats the request as a review, not as a request to rewrite the skill", - "Inspects the full skill folder, including SKILL.md, references, scripts, assets, and evals when present", - "Loads or follows references/review.md", - "Leads with actionable findings grounded in specific skill files and line references" + "Routes to authoring 
guidance", + "Produces or revises SKILL.md instructions", + "Keeps metadata and body budgets in mind", + "Uses references only when they reduce main-file complexity", + "Avoids placeholder bundled resources", + "Preserves portability and safety" ] }, { "id": 7, - "prompt": "Review this skill description: `description: Process files.` It should trigger for CSV cleanup, normalization, and deduplication tasks but not for generic file conversion.", - "expected_output": "A trigger-description review that identifies overbroad or underspecified triggering risk and suggests a precise fix direction.", + "reference": "references/authoring.md", + "prompt": "Update a skill to add section delimiters and bold principle sentences.", + "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.", "files": [], "expectations": [ - "Recognizes the target as an agent skill trigger-description review", - "Loads or follows references/review.md for trigger-description criteria", - "Checks should-trigger and should-not-trigger boundaries instead of keyword stuffing", - "Explains the likely false-negative or false-positive behavior and a concise fix direction" + "Routes to authoring guidance", + "Produces or revises SKILL.md instructions", + "Keeps metadata and body budgets in mind", + "Uses references only when they reduce main-file complexity", + "Avoids placeholder bundled resources", + "Preserves portability and safety" ] }, { "id": 8, - "prompt": "Update an existing skill to match our preferred Markdown style: every `##` section should open with a bold principle sentence, and rule lists should use bold labels as scan anchors when each item is a distinct rule.", - "expected_output": "A focused style update that applies bold principle sentences and bold-labeled rule bullets without changing the skill's behavior, schemas, command examples, or helper-agent contracts unnecessarily.", + "reference": 
"references/authoring.md", + "prompt": "Create a portable skill that avoids runtime-specific slash commands.", + "expected_output": "A response that routes to references/authoring.md and applies the creator-skill workflow for the requested task.", "files": [], "expectations": [ - "Every edited `##` section opens with a single bold principle sentence", - "Distinct rule bullets use bold labels as scan anchors", - "Schemas, command examples, and literal output templates are not restyled unnecessarily", - "The update preserves the skill's behavior and routing semantics", - "The result is checked with a deterministic Markdown validator or diff review" + "Routes to authoring guidance", + "Produces or revises SKILL.md instructions", + "Keeps metadata and body budgets in mind", + "Uses references only when they reduce main-file complexity", + "Avoids placeholder bundled resources", + "Preserves portability and safety" + ] + }, + { + "id": 9, + "reference": "references/review.md", + "prompt": "Review the new agent skill in .agents/skills/data-cleanup before publishing.", + "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to review guidance", + "Inspects the skill folder before judging", + "Leads with actionable findings", + "Checks trigger precision, scope, references, and eval coverage", + "Grounds findings in specific files or lines", + "Avoids rewriting unless requested" + ] + }, + { + "id": 10, + "reference": "references/review.md", + "prompt": "Review this skill description: description: Process files.", + "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to review guidance", + "Inspects the skill folder before judging", + "Leads with actionable findings", + "Checks trigger precision, scope, references, and eval 
coverage", + "Grounds findings in specific files or lines", + "Avoids rewriting unless requested" + ] + }, + { + "id": 11, + "reference": "references/review.md", + "prompt": "Review a skill for trigger overlap against writer-tech-docs and writer-spec.", + "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to review guidance", + "Inspects the skill folder before judging", + "Leads with actionable findings", + "Checks trigger precision, scope, references, and eval coverage", + "Grounds findings in specific files or lines", + "Avoids rewriting unless requested" + ] + }, + { + "id": 12, + "reference": "references/review.md", + "prompt": "Find quality issues in a skill that has stale references and no evals.", + "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to review guidance", + "Inspects the skill folder before judging", + "Leads with actionable findings", + "Checks trigger precision, scope, references, and eval coverage", + "Grounds findings in specific files or lines", + "Avoids rewriting unless requested" + ] + }, + { + "id": 13, + "reference": "references/review.md", + "prompt": "Review whether a skill is too broad and should be split into multiple skills.", + "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to review guidance", + "Inspects the skill folder before judging", + "Leads with actionable findings", + "Checks trigger precision, scope, references, and eval coverage", + "Grounds findings in specific files or lines", + "Avoids rewriting unless requested" + ] + }, + { + "id": 14, + "reference": "references/review.md", + "prompt": "Review a skill package for hidden credential 
capture or exfiltration risk.", + "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to review guidance", + "Inspects the skill folder before judging", + "Leads with actionable findings", + "Checks trigger precision, scope, references, and eval coverage", + "Grounds findings in specific files or lines", + "Avoids rewriting unless requested" + ] + }, + { + "id": 15, + "reference": "references/review.md", + "prompt": "Review a router skill for missing route instructions and weak output format.", + "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to review guidance", + "Inspects the skill folder before judging", + "Leads with actionable findings", + "Checks trigger precision, scope, references, and eval coverage", + "Grounds findings in specific files or lines", + "Avoids rewriting unless requested" + ] + }, + { + "id": 16, + "reference": "references/review.md", + "prompt": "Review a skill after a style-only patch and identify behavioral drift.", + "expected_output": "A response that routes to references/review.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to review guidance", + "Inspects the skill folder before judging", + "Leads with actionable findings", + "Checks trigger precision, scope, references, and eval coverage", + "Grounds findings in specific files or lines", + "Avoids rewriting unless requested" + ] + }, + { + "id": 17, + "reference": "references/evaluation.md", + "prompt": "Generate eval cases for a router skill that writes README, API docs, changelog, release notes, and runbooks.", + "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.", + "files": [], + 
"expectations": [ + "Routes to evaluation guidance", + "Creates realistic prompt-level eval cases", + "Includes route or trigger boundary coverage", + "Uses objective expectations where possible", + "Keeps evals inside the skill folder", + "Mentions reproducible iteration or benchmark workflow when relevant" + ] + }, + { + "id": 18, + "reference": "references/evaluation.md", + "prompt": "Create evals for a focused skill that writes incident postmortems.", + "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to evaluation guidance", + "Creates realistic prompt-level eval cases", + "Includes route or trigger boundary coverage", + "Uses objective expectations where possible", + "Keeps evals inside the skill folder", + "Mentions reproducible iteration or benchmark workflow when relevant" + ] + }, + { + "id": 19, + "reference": "references/evaluation.md", + "prompt": "Build near-miss trigger evals for a database skill versus report-db-health.", + "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to evaluation guidance", + "Creates realistic prompt-level eval cases", + "Includes route or trigger boundary coverage", + "Uses objective expectations where possible", + "Keeps evals inside the skill folder", + "Mentions reproducible iteration or benchmark workflow when relevant" + ] + }, + { + "id": 20, + "reference": "references/evaluation.md", + "prompt": "Plan an iteration comparing with_skill and without_skill outputs.", + "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to evaluation guidance", + "Creates realistic prompt-level eval cases", + "Includes route or trigger boundary 
coverage", + "Uses objective expectations where possible", + "Keeps evals inside the skill folder", + "Mentions reproducible iteration or benchmark workflow when relevant" + ] + }, + { + "id": 21, + "reference": "references/evaluation.md", + "prompt": "Define objective grading assertions for generated SQL migrations.", + "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to evaluation guidance", + "Creates realistic prompt-level eval cases", + "Includes route or trigger boundary coverage", + "Uses objective expectations where possible", + "Keeps evals inside the skill folder", + "Mentions reproducible iteration or benchmark workflow when relevant" + ] + }, + { + "id": 22, + "reference": "references/evaluation.md", + "prompt": "Aggregate benchmark results from iteration-2 and summarize pass rates.", + "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to evaluation guidance", + "Creates realistic prompt-level eval cases", + "Includes route or trigger boundary coverage", + "Uses objective expectations where possible", + "Keeps evals inside the skill folder", + "Mentions reproducible iteration or benchmark workflow when relevant" + ] + }, + { + "id": 23, + "reference": "references/evaluation.md", + "prompt": "Create a human review UI for a skill eval run in a headless environment.", + "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to evaluation guidance", + "Creates realistic prompt-level eval cases", + "Includes route or trigger boundary coverage", + "Uses objective expectations where possible", + "Keeps evals inside the skill folder", + "Mentions reproducible iteration or 
benchmark workflow when relevant" + ] + }, + { + "id": 24, + "reference": "references/evaluation.md", + "prompt": "Revise eval cases after repeated failures in route selection.", + "expected_output": "A response that routes to references/evaluation.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to evaluation guidance", + "Creates realistic prompt-level eval cases", + "Includes route or trigger boundary coverage", + "Uses objective expectations where possible", + "Keeps evals inside the skill folder", + "Mentions reproducible iteration or benchmark workflow when relevant" + ] + }, + { + "id": 25, + "reference": "references/description-optimization.md", + "prompt": "Optimize a skill description for database schemas and SQL queries within budget.", + "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to description optimization guidance", + "Improves trigger precision", + "Balances should-trigger and should-not-trigger cases", + "Keeps metadata concise", + "Avoids keyword stuffing", + "Explains false-positive or false-negative risk" + ] + }, + { + "id": 26, + "reference": "references/description-optimization.md", + "prompt": "Improve a trigger description that undertriggers for runbook requests.", + "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to description optimization guidance", + "Improves trigger precision", + "Balances should-trigger and should-not-trigger cases", + "Keeps metadata concise", + "Avoids keyword stuffing", + "Explains false-positive or false-negative risk" + ] + }, + { + "id": 27, + "reference": "references/description-optimization.md", + "prompt": "Make a description less broad so it does not 
trigger on generic file conversion.", + "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to description optimization guidance", + "Improves trigger precision", + "Balances should-trigger and should-not-trigger cases", + "Keeps metadata concise", + "Avoids keyword stuffing", + "Explains false-positive or false-negative risk" + ] + }, + { + "id": 28, + "reference": "references/description-optimization.md", + "prompt": "Create should-trigger and should-not-trigger examples for a skill description.", + "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to description optimization guidance", + "Improves trigger precision", + "Balances should-trigger and should-not-trigger cases", + "Keeps metadata concise", + "Avoids keyword stuffing", + "Explains false-positive or false-negative risk" + ] + }, + { + "id": 29, + "reference": "references/description-optimization.md", + "prompt": "Shorten an overlong skill description without losing core trigger cues.", + "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to description optimization guidance", + "Improves trigger precision", + "Balances should-trigger and should-not-trigger cases", + "Keeps metadata concise", + "Avoids keyword stuffing", + "Explains false-positive or false-negative risk" + ] + }, + { + "id": 30, + "reference": "references/description-optimization.md", + "prompt": "Diagnose trigger overlap between writer-prd and writer-spec descriptions.", + "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested 
task.", + "files": [], + "expectations": [ + "Routes to description optimization guidance", + "Improves trigger precision", + "Balances should-trigger and should-not-trigger cases", + "Keeps metadata concise", + "Avoids keyword stuffing", + "Explains false-positive or false-negative risk" + ] + }, + { + "id": 31, + "reference": "references/description-optimization.md", + "prompt": "Improve a code review skill description for PR, diff, branch, and patch requests.", + "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to description optimization guidance", + "Improves trigger precision", + "Balances should-trigger and should-not-trigger cases", + "Keeps metadata concise", + "Avoids keyword stuffing", + "Explains false-positive or false-negative risk" + ] + }, + { + "id": 32, + "reference": "references/description-optimization.md", + "prompt": "Optimize metadata for a router skill without stuffing every route keyword.", + "expected_output": "A response that routes to references/description-optimization.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to description optimization guidance", + "Improves trigger precision", + "Balances should-trigger and should-not-trigger cases", + "Keeps metadata concise", + "Avoids keyword stuffing", + "Explains false-positive or false-negative risk" + ] + }, + { + "id": 33, + "reference": "references/agent-compatibility.md", + "prompt": "Adapt a skill workflow for a generic CLI agent without subagents.", + "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to compatibility guidance", + "Removes or isolates runtime-specific assumptions", + "Keeps core workflow portable", + "Explains fallback 
behavior for unavailable features", + "Avoids relying on one agent UI or command set", + "Preserves skill intent" + ] + }, + { + "id": 34, + "reference": "references/agent-compatibility.md", + "prompt": "Port a skill that mentions Claude Code commands to runtime-neutral instructions.", + "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to compatibility guidance", + "Removes or isolates runtime-specific assumptions", + "Keeps core workflow portable", + "Explains fallback behavior for unavailable features", + "Avoids relying on one agent UI or command set", + "Preserves skill intent" + ] + }, + { + "id": 35, + "reference": "references/agent-compatibility.md", + "prompt": "Document compatibility notes for a skill that uses local scripts.", + "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to compatibility guidance", + "Removes or isolates runtime-specific assumptions", + "Keeps core workflow portable", + "Explains fallback behavior for unavailable features", + "Avoids relying on one agent UI or command set", + "Preserves skill intent" + ] + }, + { + "id": 36, + "reference": "references/agent-compatibility.md", + "prompt": "Adjust a skill so Browser plugin instructions are isolated from core workflow.", + "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to compatibility guidance", + "Removes or isolates runtime-specific assumptions", + "Keeps core workflow portable", + "Explains fallback behavior for unavailable features", + "Avoids relying on one agent UI or command set", + "Preserves skill intent" + ] + }, + { + "id": 37, + "reference": 
"references/agent-compatibility.md", + "prompt": "Make a skill usable in agents that cannot spawn helper agents.", + "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to compatibility guidance", + "Removes or isolates runtime-specific assumptions", + "Keeps core workflow portable", + "Explains fallback behavior for unavailable features", + "Avoids relying on one agent UI or command set", + "Preserves skill intent" + ] + }, + { + "id": 38, + "reference": "references/agent-compatibility.md", + "prompt": "Replace runtime-specific tool names with portable action descriptions.", + "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to compatibility guidance", + "Removes or isolates runtime-specific assumptions", + "Keeps core workflow portable", + "Explains fallback behavior for unavailable features", + "Avoids relying on one agent UI or command set", + "Preserves skill intent" + ] + }, + { + "id": 39, + "reference": "references/agent-compatibility.md", + "prompt": "Review a skill for assumptions about event streams and UI affordances.", + "expected_output": "A response that routes to references/agent-compatibility.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to compatibility guidance", + "Removes or isolates runtime-specific assumptions", + "Keeps core workflow portable", + "Explains fallback behavior for unavailable features", + "Avoids relying on one agent UI or command set", + "Preserves skill intent" + ] + }, + { + "id": 40, + "reference": "references/agent-compatibility.md", + "prompt": "Write compatibility guidance for packaging and validation in another CLI runtime.", + "expected_output": "A response that routes to 
references/agent-compatibility.md and applies the creator-skill workflow for the requested task.", + "files": [], + "expectations": [ + "Routes to compatibility guidance", + "Removes or isolates runtime-specific assumptions", + "Keeps core workflow portable", + "Explains fallback behavior for unavailable features", + "Avoids relying on one agent UI or command set", + "Preserves skill intent" ] } ] diff --git a/.agents/skills/creator-skill/references/authoring.md b/.agents/skills/creator-skill/references/authoring.md index 41706ac..e91cbc6 100644 --- a/.agents/skills/creator-skill/references/authoring.md +++ b/.agents/skills/creator-skill/references/authoring.md @@ -35,6 +35,8 @@ Apply the house Markdown style while writing, not as a later cleanup pass: After editing, run `creator-skill/scripts/quick_validate.py ` when this skill's scripts are available. Treat style failures as authoring bugs, not optional polish. +For router skills with `references/*.md`, create `evals/evals.json` before validation is considered complete. Each eval must include a `reference` field that points to the routed reference, and every reference file except `references/schemas.md` must have 8-10 evals. This ensures every route is exercised by a realistic spread of prompts rather than a single smoke test. + ## Length Budgets **Respect the runtime context budget before adding detail.** diff --git a/.agents/skills/creator-skill/references/evaluation.md b/.agents/skills/creator-skill/references/evaluation.md index ae11663..16cb86b 100644 --- a/.agents/skills/creator-skill/references/evaluation.md +++ b/.agents/skills/creator-skill/references/evaluation.md @@ -10,6 +10,8 @@ Create 8-10 realistic prompts for a focused skill. For router skills, create 8-1 Save test cases to `/evals/evals.json`. Keep evals inside the skill folder so prompts, fixtures, outputs, and benchmark history travel with the skill. +For router skills, add a `reference` field to every eval using the exact relative path, such as `references/postgres.md`. 
def validate_eval_coverage(skill_path):
    """Validate eval coverage for focused and router skills.

    A skill without ``evals/evals.json`` is accepted unless it ships
    ``references/*.md`` files. When the evals file exists it must hold a
    top-level object with a list field ``evals`` of at least two entries.
    If the skill also has reference files (``schemas.md`` excluded), every
    eval must name one of them in its ``reference`` field and each
    reference must be targeted by 8-10 evals.

    Args:
        skill_path: Skill root directory, as a ``Path`` or a path string.

    Returns:
        ``(True, None)`` when coverage is acceptable, otherwise
        ``(False, message)`` describing the first problem found.
    """
    # Accept plain strings too, so the check can be called on its own.
    skill_path = Path(skill_path)
    evals_path = skill_path / 'evals' / 'evals.json'
    references_dir = skill_path / 'references'

    if not evals_path.exists():
        if references_dir.exists() and any(references_dir.glob('*.md')):
            return False, "Router skills with references must include evals/evals.json"
        return True, None

    try:
        # JSON is UTF-8; do not depend on the platform default encoding.
        data = json.loads(evals_path.read_text(encoding='utf-8'))
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        return False, f"Invalid JSON in {evals_path}: {e}"
    except OSError as e:
        return False, f"Cannot read {evals_path}: {e}"

    # A top-level array or scalar has no .get(); report it as a missing
    # 'evals' field instead of crashing with AttributeError.
    evals = data.get('evals') if isinstance(data, dict) else None
    if not isinstance(evals, list):
        return False, f"{evals_path}: missing list field 'evals'"

    if len(evals) < 2:
        return False, f"{evals_path}: expected at least 2 evals, found {len(evals)}"

    if not references_dir.exists():
        return True, None

    # schemas.md is excluded from the per-reference coverage requirement.
    reference_files = [
        f"references/{path.name}"
        for path in sorted(references_dir.glob('*.md'))
        if path.name != 'schemas.md'
    ]
    if not reference_files:
        return True, None

    counts = dict.fromkeys(reference_files, 0)
    missing_reference = []
    # A list of pairs (not a dict) so duplicate eval ids are all reported.
    unknown_references = []

    for index, item in enumerate(evals, start=1):
        if not isinstance(item, dict):
            return False, f"{evals_path}: eval {index} must be an object"
        eval_id = str(item.get('id', index))
        reference = item.get('reference')
        if not reference:
            missing_reference.append(eval_id)
        elif not isinstance(reference, str) or reference not in counts:
            # The isinstance guard keeps unhashable values (lists, dicts)
            # from raising TypeError in the membership test.
            unknown_references.append((eval_id, reference))
        else:
            counts[reference] += 1

    if missing_reference:
        return False, (
            f"{evals_path}: router evals must include a 'reference' field; "
            f"missing on eval id(s): {', '.join(missing_reference[:10])}"
        )

    if unknown_references:
        examples = ', '.join(
            f"{eval_id} -> {reference}"
            for eval_id, reference in unknown_references[:5]
        )
        return False, f"{evals_path}: evals reference unknown files: {examples}"

    bad_counts = {
        reference: count
        for reference, count in counts.items()
        if count < 8 or count > 10
    }
    if bad_counts:
        summary = ', '.join(
            f"{reference}={count}"
            for reference, count in bad_counts.items()
        )
        return False, (
            f"{evals_path}: router references must each have 8-10 evals; "
            f"found {summary}"
        )

    return True, None