From f4c71a9ee652e9f5643ceb2a00f20eb615ac522e Mon Sep 17 00:00:00 2001 From: Oleg Shulyakov Date: Mon, 25 May 2026 16:24:32 +0300 Subject: [PATCH 1/3] feat(skills): migrate create-skill evals from JSON to YAML with suites/cases - Replace `evals.json` with `evals.yaml` using a suites/cases structure - Add `load_eval_set()` helper supporting both list and suite formats - Update all scripts, references, and docs to reference `.yaml` paths - Update `eval_review.html` to export YAML instead of JSON - Update `.gitignore` to track `evals.yaml` instead of `evals.json` - Bump version to 1.9.0 --- .agents/skills/create-skill/SKILL.md | 4 +- .../create-skill/assets/eval_review.html | 17 +- .agents/skills/create-skill/evals/evals.json | 629 ------------------ .agents/skills/create-skill/evals/evals.yaml | 521 +++++++++++++++ .../references/agent-compatibility.md | 6 +- .../create-skill/references/authoring.md | 2 +- .../references/description-optimization.md | 22 +- .../create-skill/references/evaluation.md | 4 +- .../skills/create-skill/references/schemas.md | 53 +- .../create-skill/scripts/quick_validate.py | 41 +- .../skills/create-skill/scripts/run_eval.py | 32 +- .../skills/create-skill/scripts/run_loop.py | 6 +- .gitignore | 2 +- docs/2026-05-02-team-roles-as-skills/SPEC.md | 6 +- docs/2026-05-20-general-agent-skills/SPEC.md | 8 +- ...US-001-author-standalone-general-skills.md | 2 +- .../US-002-generate-skill-evals.md | 28 +- ...US-003-validate-skills-and-update-index.md | 4 +- 18 files changed, 667 insertions(+), 720 deletions(-) delete mode 100644 .agents/skills/create-skill/evals/evals.json create mode 100644 .agents/skills/create-skill/evals/evals.yaml diff --git a/.agents/skills/create-skill/SKILL.md b/.agents/skills/create-skill/SKILL.md index 54b7fce..c1e4493 100644 --- a/.agents/skills/create-skill/SKILL.md +++ b/.agents/skills/create-skill/SKILL.md @@ -8,7 +8,7 @@ tags: - authoring metadata: author: Anthropic - version: "1.8.0" + version: "1.9.0" source: 
github.com/anthropics/skills catalog: utility category: meta @@ -31,7 +31,7 @@ Create new skills, review and improve existing skills, evaluate outputs, optimiz | Build eval cases, run iterations, benchmark outputs, or collect human feedback | `references/evaluation.md` | | Optimize a skill description for trigger accuracy | `references/description-optimization.md` | | Adapt the workflow for agents without subagents, Claude Code, generic CLIs, or Cowork | `references/agent-compatibility.md` | - | Validate eval, grading, benchmark, or feedback JSON structures | `references/schemas.md` | + | Validate eval YAML or grading, benchmark, and feedback JSON structures | `references/schemas.md` | If the request spans multiple phases, read the references in workflow order: authoring, review, evaluation, description optimization, then agent compatibility only when platform details matter. diff --git a/.agents/skills/create-skill/assets/eval_review.html b/.agents/skills/create-skill/assets/eval_review.html index 6e1e2c4..15a0fa2 100644 --- a/.agents/skills/create-skill/assets/eval_review.html +++ b/.agents/skills/create-skill/assets/eval_review.html @@ -270,11 +270,24 @@

Eval Set Review: __SKILL_NAME_PLACEHOLDER__

i.query.trim() !== ""); const data = valid.map((i) => ({ query: i.query.trim(), should_trigger: i.should_trigger })); - const blob = new Blob([JSON.stringify(data, null, 2)], { type: "application/json" }); + const yaml = [ + `name: ${JSON.stringify(skillName + " trigger evals")}`, + "suites:", + " trigger-routing:", + " description: Trigger and non-trigger routing checks.", + " cases:", + ...data.flatMap((item, idx) => [ + ` case-${String(idx + 1).padStart(3, "0")}:`, + ` query: ${JSON.stringify(item.query)}`, + ` should_trigger: ${item.should_trigger ? "true" : "false"}`, + ]), + "", + ].join("\n"); + const blob = new Blob([yaml], { type: "application/x-yaml" }); const url = URL.createObjectURL(blob); const a = document.createElement("a"); a.href = url; - a.download = "eval_set.json"; + a.download = "eval_set.yaml"; document.body.appendChild(a); a.click(); document.body.removeChild(a); diff --git a/.agents/skills/create-skill/evals/evals.json b/.agents/skills/create-skill/evals/evals.json deleted file mode 100644 index 0cb84a7..0000000 --- a/.agents/skills/create-skill/evals/evals.json +++ /dev/null @@ -1,629 +0,0 @@ -{ - "skill_name": "create-skill", - "evals": [ - { - "id": 1, - "reference": "references/authoring.md", - "prompt": "Create a new skill for incident postmortems with concise SKILL.md and only useful references.", - "expected_output": "A response that routes to references/authoring.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to authoring guidance", - "Produces or revises SKILL.md instructions", - "Keeps metadata and body budgets in mind", - "Uses references only when they reduce main-file complexity", - "Uses the reference format that best teaches the behavior instead of defaulting to terse bullets", - "Avoids placeholder bundled resources", - "Preserves portability and safety" - ] - }, - { - "id": 2, - "reference": "references/authoring.md", - "prompt": "Refactor a 7000-token cloud 
skill into a router without changing capabilities.", - "expected_output": "A response that routes to references/authoring.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to authoring guidance", - "Produces or revises SKILL.md instructions", - "Keeps metadata and body budgets in mind", - "Uses references only when they reduce main-file complexity", - "Uses the reference format that best teaches the behavior instead of defaulting to terse bullets", - "Avoids placeholder bundled resources", - "Preserves portability and safety" - ] - }, - { - "id": 3, - "reference": "references/authoring.md", - "prompt": "Create a skill for CSV cleanup with trigger description, workflow, references, and evals.", - "expected_output": "A response that routes to references/authoring.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to authoring guidance", - "Produces or revises SKILL.md instructions", - "Keeps metadata and body budgets in mind", - "Uses references only when they reduce main-file complexity", - "Uses the reference format that best teaches the behavior instead of defaulting to terse bullets", - "Avoids placeholder bundled resources", - "Preserves portability and safety" - ] - }, - { - "id": 4, - "reference": "references/authoring.md", - "prompt": "Revise a skill so its frontmatter stays under 100 tokens and the body under 500 lines.", - "expected_output": "A response that routes to references/authoring.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to authoring guidance", - "Produces or revises SKILL.md instructions", - "Keeps metadata and body budgets in mind", - "Uses references only when they reduce main-file complexity", - "Uses the reference format that best teaches the behavior instead of defaulting to terse bullets", - "Avoids placeholder bundled resources", - "Preserves 
portability and safety" - ] - }, - { - "id": 5, - "reference": "references/authoring.md", - "prompt": "Turn a long provider-specific skill into progressive-disclosure references.", - "expected_output": "A response that routes to references/authoring.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to authoring guidance", - "Produces or revises SKILL.md instructions", - "Keeps metadata and body budgets in mind", - "Uses references only when they reduce main-file complexity", - "Uses the reference format that best teaches the behavior instead of defaulting to terse bullets", - "Avoids placeholder bundled resources", - "Preserves portability and safety" - ] - }, - { - "id": 6, - "reference": "references/authoring.md", - "prompt": "Create a new skill folder and avoid placeholder scripts/assets/evals.", - "expected_output": "A response that routes to references/authoring.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to authoring guidance", - "Produces or revises SKILL.md instructions", - "Keeps metadata and body budgets in mind", - "Uses references only when they reduce main-file complexity", - "Uses the reference format that best teaches the behavior instead of defaulting to terse bullets", - "Avoids placeholder bundled resources", - "Preserves portability and safety" - ] - }, - { - "id": 7, - "reference": "references/authoring.md", - "prompt": "Update a skill to add section delimiters and bold principle sentences.", - "expected_output": "A response that routes to references/authoring.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to authoring guidance", - "Produces or revises SKILL.md instructions", - "Keeps metadata and body budgets in mind", - "Uses references only when they reduce main-file complexity", - "Uses the reference format that best teaches the behavior instead of 
defaulting to terse bullets", - "Avoids placeholder bundled resources", - "Preserves portability and safety" - ] - }, - { - "id": 8, - "reference": "references/authoring.md", - "prompt": "Create a portable skill that avoids runtime-specific slash commands.", - "expected_output": "A response that routes to references/authoring.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to authoring guidance", - "Produces or revises SKILL.md instructions", - "Keeps metadata and body budgets in mind", - "Uses references only when they reduce main-file complexity", - "Uses the reference format that best teaches the behavior instead of defaulting to terse bullets", - "Avoids placeholder bundled resources", - "Preserves portability and safety" - ] - }, - { - "id": 9, - "reference": "references/review.md", - "prompt": "Review the new agent skill in .agents/skills/data-cleanup before publishing.", - "expected_output": "A response that routes to references/review.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to review guidance", - "Inspects the skill folder before judging", - "Leads with actionable findings", - "Checks trigger precision, scope, references, and eval coverage", - "Grounds findings in specific files or lines", - "Avoids rewriting unless requested" - ] - }, - { - "id": 10, - "reference": "references/review.md", - "prompt": "Review this skill description: description: Process files.", - "expected_output": "A response that routes to references/review.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to review guidance", - "Inspects the skill folder before judging", - "Leads with actionable findings", - "Checks trigger precision, scope, references, and eval coverage", - "Grounds findings in specific files or lines", - "Avoids rewriting unless requested" - ] - }, - { - "id": 11, - 
"reference": "references/review.md", - "prompt": "Review a skill for trigger overlap against write-tech-docs and write-spec.", - "expected_output": "A response that routes to references/review.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to review guidance", - "Inspects the skill folder before judging", - "Leads with actionable findings", - "Checks trigger precision, scope, references, and eval coverage", - "Grounds findings in specific files or lines", - "Avoids rewriting unless requested" - ] - }, - { - "id": 12, - "reference": "references/review.md", - "prompt": "Find quality issues in a skill that has stale references and no evals.", - "expected_output": "A response that routes to references/review.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to review guidance", - "Inspects the skill folder before judging", - "Leads with actionable findings", - "Checks trigger precision, scope, references, and eval coverage", - "Grounds findings in specific files or lines", - "Avoids rewriting unless requested" - ] - }, - { - "id": 13, - "reference": "references/review.md", - "prompt": "Review whether a skill is too broad and should be split into multiple skills.", - "expected_output": "A response that routes to references/review.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to review guidance", - "Inspects the skill folder before judging", - "Leads with actionable findings", - "Checks trigger precision, scope, references, and eval coverage", - "Grounds findings in specific files or lines", - "Avoids rewriting unless requested" - ] - }, - { - "id": 14, - "reference": "references/review.md", - "prompt": "Review a skill package for hidden credential capture or exfiltration risk.", - "expected_output": "A response that routes to references/review.md and applies the create-skill 
workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to review guidance", - "Inspects the skill folder before judging", - "Leads with actionable findings", - "Checks trigger precision, scope, references, and eval coverage", - "Grounds findings in specific files or lines", - "Avoids rewriting unless requested" - ] - }, - { - "id": 15, - "reference": "references/review.md", - "prompt": "Review a router skill for missing route instructions and weak output format.", - "expected_output": "A response that routes to references/review.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to review guidance", - "Inspects the skill folder before judging", - "Leads with actionable findings", - "Checks trigger precision, scope, references, and eval coverage", - "Grounds findings in specific files or lines", - "Avoids rewriting unless requested" - ] - }, - { - "id": 16, - "reference": "references/review.md", - "prompt": "Review a skill after a style-only patch and identify behavioral drift.", - "expected_output": "A response that routes to references/review.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to review guidance", - "Inspects the skill folder before judging", - "Leads with actionable findings", - "Checks trigger precision, scope, references, and eval coverage", - "Grounds findings in specific files or lines", - "Avoids rewriting unless requested" - ] - }, - { - "id": 17, - "reference": "references/evaluation.md", - "prompt": "Generate eval cases for a router skill that writes README, API docs, changelog, release notes, and runbooks.", - "expected_output": "A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to evaluation guidance", - "Creates realistic prompt-level eval cases", - "Includes route or trigger boundary 
coverage", - "Derives assertions from the skill contract when objective checks are useful", - "Covers distinct failure modes or input classes without redundant assertions", - "Includes at least one negative assertion for evals with objective checks", - "Keeps evals inside the skill folder", - "Mentions reproducible iteration or benchmark workflow when relevant" - ] - }, - { - "id": 18, - "reference": "references/evaluation.md", - "prompt": "Create evals for a focused skill that writes incident postmortems.", - "expected_output": "A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to evaluation guidance", - "Creates realistic prompt-level eval cases", - "Includes route or trigger boundary coverage", - "Derives assertions from the skill contract when objective checks are useful", - "Covers distinct failure modes or input classes without redundant assertions", - "Includes at least one negative assertion for evals with objective checks", - "Keeps evals inside the skill folder", - "Mentions reproducible iteration or benchmark workflow when relevant" - ] - }, - { - "id": 19, - "reference": "references/evaluation.md", - "prompt": "Build near-miss trigger evals for a database skill versus report-db-health.", - "expected_output": "A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to evaluation guidance", - "Creates realistic prompt-level eval cases", - "Includes route or trigger boundary coverage", - "Derives assertions from the skill contract when objective checks are useful", - "Covers distinct failure modes or input classes without redundant assertions", - "Includes at least one negative assertion for evals with objective checks", - "Keeps evals inside the skill folder", - "Mentions reproducible iteration or benchmark workflow when relevant" - ] - }, - { - 
"id": 20, - "reference": "references/evaluation.md", - "prompt": "Plan an iteration comparing with_skill and without_skill outputs.", - "expected_output": "A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to evaluation guidance", - "Creates realistic prompt-level eval cases", - "Includes route or trigger boundary coverage", - "Derives assertions from the skill contract when objective checks are useful", - "Covers distinct failure modes or input classes without redundant assertions", - "Includes at least one negative assertion for evals with objective checks", - "Keeps evals inside the skill folder", - "Mentions reproducible iteration or benchmark workflow when relevant" - ] - }, - { - "id": 21, - "reference": "references/evaluation.md", - "prompt": "Define objective grading assertions for generated SQL migrations.", - "expected_output": "A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to evaluation guidance", - "Creates realistic prompt-level eval cases", - "Includes route or trigger boundary coverage", - "Derives assertions from the skill contract when objective checks are useful", - "Covers distinct failure modes or input classes without redundant assertions", - "Includes at least one negative assertion for evals with objective checks", - "Keeps evals inside the skill folder", - "Mentions reproducible iteration or benchmark workflow when relevant" - ] - }, - { - "id": 22, - "reference": "references/evaluation.md", - "prompt": "Aggregate benchmark results from iteration-2 and summarize pass rates.", - "expected_output": "A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to evaluation guidance", - "Creates realistic prompt-level eval 
cases", - "Includes route or trigger boundary coverage", - "Derives assertions from the skill contract when objective checks are useful", - "Covers distinct failure modes or input classes without redundant assertions", - "Includes at least one negative assertion for evals with objective checks", - "Keeps evals inside the skill folder", - "Mentions reproducible iteration or benchmark workflow when relevant" - ] - }, - { - "id": 23, - "reference": "references/evaluation.md", - "prompt": "Create a human review UI for a skill eval run in a headless environment.", - "expected_output": "A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to evaluation guidance", - "Creates realistic prompt-level eval cases", - "Includes route or trigger boundary coverage", - "Derives assertions from the skill contract when objective checks are useful", - "Covers distinct failure modes or input classes without redundant assertions", - "Includes at least one negative assertion for evals with objective checks", - "Keeps evals inside the skill folder", - "Mentions reproducible iteration or benchmark workflow when relevant" - ] - }, - { - "id": 24, - "reference": "references/evaluation.md", - "prompt": "Revise eval cases after repeated failures in route selection.", - "expected_output": "A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to evaluation guidance", - "Creates realistic prompt-level eval cases", - "Includes route or trigger boundary coverage", - "Derives assertions from the skill contract when objective checks are useful", - "Covers distinct failure modes or input classes without redundant assertions", - "Includes at least one negative assertion for evals with objective checks", - "Keeps evals inside the skill folder", - "Mentions reproducible iteration or benchmark 
workflow when relevant" - ] - }, - { - "id": 25, - "reference": "references/description-optimization.md", - "prompt": "Optimize a skill description for database schemas and SQL queries within budget.", - "expected_output": "A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to description optimization guidance", - "Improves trigger precision", - "Balances should-trigger and should-not-trigger cases", - "Keeps metadata concise", - "Avoids keyword stuffing", - "Explains false-positive or false-negative risk" - ] - }, - { - "id": 26, - "reference": "references/description-optimization.md", - "prompt": "Improve a trigger description that undertriggers for runbook requests.", - "expected_output": "A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to description optimization guidance", - "Improves trigger precision", - "Balances should-trigger and should-not-trigger cases", - "Keeps metadata concise", - "Avoids keyword stuffing", - "Explains false-positive or false-negative risk" - ] - }, - { - "id": 27, - "reference": "references/description-optimization.md", - "prompt": "Make a description less broad so it does not trigger on generic file conversion.", - "expected_output": "A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to description optimization guidance", - "Improves trigger precision", - "Balances should-trigger and should-not-trigger cases", - "Keeps metadata concise", - "Avoids keyword stuffing", - "Explains false-positive or false-negative risk" - ] - }, - { - "id": 28, - "reference": "references/description-optimization.md", - "prompt": "Create should-trigger and should-not-trigger examples for 
a skill description.", - "expected_output": "A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to description optimization guidance", - "Improves trigger precision", - "Balances should-trigger and should-not-trigger cases", - "Keeps metadata concise", - "Avoids keyword stuffing", - "Explains false-positive or false-negative risk" - ] - }, - { - "id": 29, - "reference": "references/description-optimization.md", - "prompt": "Shorten an overlong skill description without losing core trigger cues.", - "expected_output": "A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to description optimization guidance", - "Improves trigger precision", - "Balances should-trigger and should-not-trigger cases", - "Keeps metadata concise", - "Avoids keyword stuffing", - "Explains false-positive or false-negative risk" - ] - }, - { - "id": 30, - "reference": "references/description-optimization.md", - "prompt": "Diagnose trigger overlap between write-prd and write-spec descriptions.", - "expected_output": "A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to description optimization guidance", - "Improves trigger precision", - "Balances should-trigger and should-not-trigger cases", - "Keeps metadata concise", - "Avoids keyword stuffing", - "Explains false-positive or false-negative risk" - ] - }, - { - "id": 31, - "reference": "references/description-optimization.md", - "prompt": "Improve a code review skill description for PR, diff, branch, and patch requests.", - "expected_output": "A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task.", - 
"files": [], - "expectations": [ - "Routes to description optimization guidance", - "Improves trigger precision", - "Balances should-trigger and should-not-trigger cases", - "Keeps metadata concise", - "Avoids keyword stuffing", - "Explains false-positive or false-negative risk" - ] - }, - { - "id": 32, - "reference": "references/description-optimization.md", - "prompt": "Optimize metadata for a router skill without stuffing every route keyword.", - "expected_output": "A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to description optimization guidance", - "Improves trigger precision", - "Balances should-trigger and should-not-trigger cases", - "Keeps metadata concise", - "Avoids keyword stuffing", - "Explains false-positive or false-negative risk" - ] - }, - { - "id": 33, - "reference": "references/agent-compatibility.md", - "prompt": "Adapt a skill workflow for a generic CLI agent without subagents.", - "expected_output": "A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to compatibility guidance", - "Removes or isolates runtime-specific assumptions", - "Keeps core workflow portable", - "Explains fallback behavior for unavailable features", - "Avoids relying on one agent UI or command set", - "Preserves skill intent" - ] - }, - { - "id": 34, - "reference": "references/agent-compatibility.md", - "prompt": "Port a skill that mentions Claude Code commands to runtime-neutral instructions.", - "expected_output": "A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to compatibility guidance", - "Removes or isolates runtime-specific assumptions", - "Keeps core workflow portable", - "Explains fallback behavior for 
unavailable features", - "Avoids relying on one agent UI or command set", - "Preserves skill intent" - ] - }, - { - "id": 35, - "reference": "references/agent-compatibility.md", - "prompt": "Document compatibility notes for a skill that uses local scripts.", - "expected_output": "A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to compatibility guidance", - "Removes or isolates runtime-specific assumptions", - "Keeps core workflow portable", - "Explains fallback behavior for unavailable features", - "Avoids relying on one agent UI or command set", - "Preserves skill intent" - ] - }, - { - "id": 36, - "reference": "references/agent-compatibility.md", - "prompt": "Adjust a skill so Browser plugin instructions are isolated from core workflow.", - "expected_output": "A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to compatibility guidance", - "Removes or isolates runtime-specific assumptions", - "Keeps core workflow portable", - "Explains fallback behavior for unavailable features", - "Avoids relying on one agent UI or command set", - "Preserves skill intent" - ] - }, - { - "id": 37, - "reference": "references/agent-compatibility.md", - "prompt": "Make a skill usable in agents that cannot spawn helper agents.", - "expected_output": "A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to compatibility guidance", - "Removes or isolates runtime-specific assumptions", - "Keeps core workflow portable", - "Explains fallback behavior for unavailable features", - "Avoids relying on one agent UI or command set", - "Preserves skill intent" - ] - }, - { - "id": 38, - "reference": "references/agent-compatibility.md", - "prompt": 
"Replace runtime-specific tool names with portable action descriptions.", - "expected_output": "A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to compatibility guidance", - "Removes or isolates runtime-specific assumptions", - "Keeps core workflow portable", - "Explains fallback behavior for unavailable features", - "Avoids relying on one agent UI or command set", - "Preserves skill intent" - ] - }, - { - "id": 39, - "reference": "references/agent-compatibility.md", - "prompt": "Review a skill for assumptions about event streams and UI affordances.", - "expected_output": "A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to compatibility guidance", - "Removes or isolates runtime-specific assumptions", - "Keeps core workflow portable", - "Explains fallback behavior for unavailable features", - "Avoids relying on one agent UI or command set", - "Preserves skill intent" - ] - }, - { - "id": 40, - "reference": "references/agent-compatibility.md", - "prompt": "Write compatibility guidance for packaging and validation in another CLI runtime.", - "expected_output": "A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task.", - "files": [], - "expectations": [ - "Routes to compatibility guidance", - "Removes or isolates runtime-specific assumptions", - "Keeps core workflow portable", - "Explains fallback behavior for unavailable features", - "Avoids relying on one agent UI or command set", - "Preserves skill intent" - ] - } - ] -} diff --git a/.agents/skills/create-skill/evals/evals.yaml b/.agents/skills/create-skill/evals/evals.yaml new file mode 100644 index 0000000..644764e --- /dev/null +++ b/.agents/skills/create-skill/evals/evals.yaml @@ -0,0 +1,521 @@ +name: 
create-skill evals +suites: + authoring: + description: Eval cases routed to references/authoring.md. + cases: + case-001: + reference: references/authoring.md + prompt: Create a new skill for incident postmortems with concise SKILL.md and only useful references. + expected_output: A response that routes to references/authoring.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to authoring guidance + - Produces or revises SKILL.md instructions + - Keeps metadata and body budgets in mind + - Uses references only when they reduce main-file complexity + - Uses the reference format that best teaches the behavior instead of defaulting to terse bullets + - Avoids placeholder bundled resources + - Preserves portability and safety + case-002: + reference: references/authoring.md + prompt: Refactor a 7000-token cloud skill into a router without changing capabilities. + expected_output: A response that routes to references/authoring.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to authoring guidance + - Produces or revises SKILL.md instructions + - Keeps metadata and body budgets in mind + - Uses references only when they reduce main-file complexity + - Uses the reference format that best teaches the behavior instead of defaulting to terse bullets + - Avoids placeholder bundled resources + - Preserves portability and safety + case-003: + reference: references/authoring.md + prompt: Create a skill for CSV cleanup with trigger description, workflow, references, and evals. + expected_output: A response that routes to references/authoring.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to authoring guidance + - Produces or revises SKILL.md instructions + - Keeps metadata and body budgets in mind + - Uses references only when they reduce main-file complexity + - Uses the reference format that best teaches the behavior instead of defaulting to terse bullets + - Avoids placeholder bundled resources + - Preserves portability and safety + case-004: + reference: references/authoring.md + prompt: Revise a skill so its frontmatter stays under 100 tokens and the body under 500 lines. + expected_output: A response that routes to references/authoring.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to authoring guidance + - Produces or revises SKILL.md instructions + - Keeps metadata and body budgets in mind + - Uses references only when they reduce main-file complexity + - Uses the reference format that best teaches the behavior instead of defaulting to terse bullets + - Avoids placeholder bundled resources + - Preserves portability and safety + case-005: + reference: references/authoring.md + prompt: Turn a long provider-specific skill into progressive-disclosure references. + expected_output: A response that routes to references/authoring.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to authoring guidance + - Produces or revises SKILL.md instructions + - Keeps metadata and body budgets in mind + - Uses references only when they reduce main-file complexity + - Uses the reference format that best teaches the behavior instead of defaulting to terse bullets + - Avoids placeholder bundled resources + - Preserves portability and safety + case-006: + reference: references/authoring.md + prompt: Create a new skill folder and avoid placeholder scripts/assets/evals. + expected_output: A response that routes to references/authoring.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to authoring guidance + - Produces or revises SKILL.md instructions + - Keeps metadata and body budgets in mind + - Uses references only when they reduce main-file complexity + - Uses the reference format that best teaches the behavior instead of defaulting to terse bullets + - Avoids placeholder bundled resources + - Preserves portability and safety + case-007: + reference: references/authoring.md + prompt: Update a skill to add section delimiters and bold principle sentences. + expected_output: A response that routes to references/authoring.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to authoring guidance + - Produces or revises SKILL.md instructions + - Keeps metadata and body budgets in mind + - Uses references only when they reduce main-file complexity + - Uses the reference format that best teaches the behavior instead of defaulting to terse bullets + - Avoids placeholder bundled resources + - Preserves portability and safety + case-008: + reference: references/authoring.md + prompt: Create a portable skill that avoids runtime-specific slash commands. + expected_output: A response that routes to references/authoring.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to authoring guidance + - Produces or revises SKILL.md instructions + - Keeps metadata and body budgets in mind + - Uses references only when they reduce main-file complexity + - Uses the reference format that best teaches the behavior instead of defaulting to terse bullets + - Avoids placeholder bundled resources + - Preserves portability and safety + review: + description: Eval cases routed to references/review.md. + cases: + case-009: + reference: references/review.md + prompt: Review the new agent skill in .agents/skills/data-cleanup before publishing. 
+ expected_output: A response that routes to references/review.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to review guidance + - Inspects the skill folder before judging + - Leads with actionable findings + - Checks trigger precision, scope, references, and eval coverage + - Grounds findings in specific files or lines + - Avoids rewriting unless requested + case-010: + reference: references/review.md + prompt: 'Review this skill description: description: Process files.' + expected_output: A response that routes to references/review.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to review guidance + - Inspects the skill folder before judging + - Leads with actionable findings + - Checks trigger precision, scope, references, and eval coverage + - Grounds findings in specific files or lines + - Avoids rewriting unless requested + case-011: + reference: references/review.md + prompt: Review a skill for trigger overlap against write-tech-docs and write-spec. + expected_output: A response that routes to references/review.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to review guidance + - Inspects the skill folder before judging + - Leads with actionable findings + - Checks trigger precision, scope, references, and eval coverage + - Grounds findings in specific files or lines + - Avoids rewriting unless requested + case-012: + reference: references/review.md + prompt: Find quality issues in a skill that has stale references and no evals. + expected_output: A response that routes to references/review.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to review guidance + - Inspects the skill folder before judging + - Leads with actionable findings + - Checks trigger precision, scope, references, and eval coverage + - Grounds findings in specific files or lines + - Avoids rewriting unless requested + case-013: + reference: references/review.md + prompt: Review whether a skill is too broad and should be split into multiple skills. + expected_output: A response that routes to references/review.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to review guidance + - Inspects the skill folder before judging + - Leads with actionable findings + - Checks trigger precision, scope, references, and eval coverage + - Grounds findings in specific files or lines + - Avoids rewriting unless requested + case-014: + reference: references/review.md + prompt: Review a skill package for hidden credential capture or exfiltration risk. + expected_output: A response that routes to references/review.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to review guidance + - Inspects the skill folder before judging + - Leads with actionable findings + - Checks trigger precision, scope, references, and eval coverage + - Grounds findings in specific files or lines + - Avoids rewriting unless requested + case-015: + reference: references/review.md + prompt: Review a router skill for missing route instructions and weak output format. + expected_output: A response that routes to references/review.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to review guidance + - Inspects the skill folder before judging + - Leads with actionable findings + - Checks trigger precision, scope, references, and eval coverage + - Grounds findings in specific files or lines + - Avoids rewriting unless requested + case-016: + reference: references/review.md + prompt: Review a skill after a style-only patch and identify behavioral drift. + expected_output: A response that routes to references/review.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to review guidance + - Inspects the skill folder before judging + - Leads with actionable findings + - Checks trigger precision, scope, references, and eval coverage + - Grounds findings in specific files or lines + - Avoids rewriting unless requested + evaluation: + description: Eval cases routed to references/evaluation.md. + cases: + case-017: + reference: references/evaluation.md + prompt: Generate eval cases for a router skill that writes README, API docs, changelog, release notes, and runbooks. + expected_output: A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to evaluation guidance + - Creates realistic prompt-level eval cases + - Includes route or trigger boundary coverage + - Derives assertions from the skill contract when objective checks are useful + - Covers distinct failure modes or input classes without redundant assertions + - Includes at least one negative assertion for evals with objective checks + - Keeps evals inside the skill folder + - Mentions reproducible iteration or benchmark workflow when relevant + case-018: + reference: references/evaluation.md + prompt: Create evals for a focused skill that writes incident postmortems. + expected_output: A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to evaluation guidance + - Creates realistic prompt-level eval cases + - Includes route or trigger boundary coverage + - Derives assertions from the skill contract when objective checks are useful + - Covers distinct failure modes or input classes without redundant assertions + - Includes at least one negative assertion for evals with objective checks + - Keeps evals inside the skill folder + - Mentions reproducible iteration or benchmark workflow when relevant + case-019: + reference: references/evaluation.md + prompt: Build near-miss trigger evals for a database skill versus report-db-health. + expected_output: A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to evaluation guidance + - Creates realistic prompt-level eval cases + - Includes route or trigger boundary coverage + - Derives assertions from the skill contract when objective checks are useful + - Covers distinct failure modes or input classes without redundant assertions + - Includes at least one negative assertion for evals with objective checks + - Keeps evals inside the skill folder + - Mentions reproducible iteration or benchmark workflow when relevant + case-020: + reference: references/evaluation.md + prompt: Plan an iteration comparing with_skill and without_skill outputs. + expected_output: A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to evaluation guidance + - Creates realistic prompt-level eval cases + - Includes route or trigger boundary coverage + - Derives assertions from the skill contract when objective checks are useful + - Covers distinct failure modes or input classes without redundant assertions + - Includes at least one negative assertion for evals with objective checks + - Keeps evals inside the skill folder + - Mentions reproducible iteration or benchmark workflow when relevant + case-021: + reference: references/evaluation.md + prompt: Define objective grading assertions for generated SQL migrations. + expected_output: A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to evaluation guidance + - Creates realistic prompt-level eval cases + - Includes route or trigger boundary coverage + - Derives assertions from the skill contract when objective checks are useful + - Covers distinct failure modes or input classes without redundant assertions + - Includes at least one negative assertion for evals with objective checks + - Keeps evals inside the skill folder + - Mentions reproducible iteration or benchmark workflow when relevant + case-022: + reference: references/evaluation.md + prompt: Aggregate benchmark results from iteration-2 and summarize pass rates. + expected_output: A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to evaluation guidance + - Creates realistic prompt-level eval cases + - Includes route or trigger boundary coverage + - Derives assertions from the skill contract when objective checks are useful + - Covers distinct failure modes or input classes without redundant assertions + - Includes at least one negative assertion for evals with objective checks + - Keeps evals inside the skill folder + - Mentions reproducible iteration or benchmark workflow when relevant + case-023: + reference: references/evaluation.md + prompt: Create a human review UI for a skill eval run in a headless environment. + expected_output: A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to evaluation guidance + - Creates realistic prompt-level eval cases + - Includes route or trigger boundary coverage + - Derives assertions from the skill contract when objective checks are useful + - Covers distinct failure modes or input classes without redundant assertions + - Includes at least one negative assertion for evals with objective checks + - Keeps evals inside the skill folder + - Mentions reproducible iteration or benchmark workflow when relevant + case-024: + reference: references/evaluation.md + prompt: Revise eval cases after repeated failures in route selection. + expected_output: A response that routes to references/evaluation.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to evaluation guidance + - Creates realistic prompt-level eval cases + - Includes route or trigger boundary coverage + - Derives assertions from the skill contract when objective checks are useful + - Covers distinct failure modes or input classes without redundant assertions + - Includes at least one negative assertion for evals with objective checks + - Keeps evals inside the skill folder + - Mentions reproducible iteration or benchmark workflow when relevant + description-optimization: + description: Eval cases routed to references/description-optimization.md. + cases: + case-025: + reference: references/description-optimization.md + prompt: Optimize a skill description for database schemas and SQL queries within budget. + expected_output: A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to description optimization guidance + - Improves trigger precision + - Balances should-trigger and should-not-trigger cases + - Keeps metadata concise + - Avoids keyword stuffing + - Explains false-positive or false-negative risk + case-026: + reference: references/description-optimization.md + prompt: Improve a trigger description that undertriggers for runbook requests. + expected_output: A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to description optimization guidance + - Improves trigger precision + - Balances should-trigger and should-not-trigger cases + - Keeps metadata concise + - Avoids keyword stuffing + - Explains false-positive or false-negative risk + case-027: + reference: references/description-optimization.md + prompt: Make a description less broad so it does not trigger on generic file conversion. 
+ expected_output: A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to description optimization guidance + - Improves trigger precision + - Balances should-trigger and should-not-trigger cases + - Keeps metadata concise + - Avoids keyword stuffing + - Explains false-positive or false-negative risk + case-028: + reference: references/description-optimization.md + prompt: Create should-trigger and should-not-trigger examples for a skill description. + expected_output: A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to description optimization guidance + - Improves trigger precision + - Balances should-trigger and should-not-trigger cases + - Keeps metadata concise + - Avoids keyword stuffing + - Explains false-positive or false-negative risk + case-029: + reference: references/description-optimization.md + prompt: Shorten an overlong skill description without losing core trigger cues. + expected_output: A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to description optimization guidance + - Improves trigger precision + - Balances should-trigger and should-not-trigger cases + - Keeps metadata concise + - Avoids keyword stuffing + - Explains false-positive or false-negative risk + case-030: + reference: references/description-optimization.md + prompt: Diagnose trigger overlap between write-prd and write-spec descriptions. + expected_output: A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to description optimization guidance + - Improves trigger precision + - Balances should-trigger and should-not-trigger cases + - Keeps metadata concise + - Avoids keyword stuffing + - Explains false-positive or false-negative risk + case-031: + reference: references/description-optimization.md + prompt: Improve a code review skill description for PR, diff, branch, and patch requests. + expected_output: A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to description optimization guidance + - Improves trigger precision + - Balances should-trigger and should-not-trigger cases + - Keeps metadata concise + - Avoids keyword stuffing + - Explains false-positive or false-negative risk + case-032: + reference: references/description-optimization.md + prompt: Optimize metadata for a router skill without stuffing every route keyword. + expected_output: A response that routes to references/description-optimization.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to description optimization guidance + - Improves trigger precision + - Balances should-trigger and should-not-trigger cases + - Keeps metadata concise + - Avoids keyword stuffing + - Explains false-positive or false-negative risk + agent-compatibility: + description: Eval cases routed to references/agent-compatibility.md. + cases: + case-033: + reference: references/agent-compatibility.md + prompt: Adapt a skill workflow for a generic CLI agent without subagents. + expected_output: A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to compatibility guidance + - Removes or isolates runtime-specific assumptions + - Keeps core workflow portable + - Explains fallback behavior for unavailable features + - Avoids relying on one agent UI or command set + - Preserves skill intent + case-034: + reference: references/agent-compatibility.md + prompt: Port a skill that mentions Claude Code commands to runtime-neutral instructions. + expected_output: A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to compatibility guidance + - Removes or isolates runtime-specific assumptions + - Keeps core workflow portable + - Explains fallback behavior for unavailable features + - Avoids relying on one agent UI or command set + - Preserves skill intent + case-035: + reference: references/agent-compatibility.md + prompt: Document compatibility notes for a skill that uses local scripts. + expected_output: A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to compatibility guidance + - Removes or isolates runtime-specific assumptions + - Keeps core workflow portable + - Explains fallback behavior for unavailable features + - Avoids relying on one agent UI or command set + - Preserves skill intent + case-036: + reference: references/agent-compatibility.md + prompt: Adjust a skill so Browser plugin instructions are isolated from core workflow. + expected_output: A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to compatibility guidance + - Removes or isolates runtime-specific assumptions + - Keeps core workflow portable + - Explains fallback behavior for unavailable features + - Avoids relying on one agent UI or command set + - Preserves skill intent + case-037: + reference: references/agent-compatibility.md + prompt: Make a skill usable in agents that cannot spawn helper agents. + expected_output: A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to compatibility guidance + - Removes or isolates runtime-specific assumptions + - Keeps core workflow portable + - Explains fallback behavior for unavailable features + - Avoids relying on one agent UI or command set + - Preserves skill intent + case-038: + reference: references/agent-compatibility.md + prompt: Replace runtime-specific tool names with portable action descriptions. + expected_output: A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to compatibility guidance + - Removes or isolates runtime-specific assumptions + - Keeps core workflow portable + - Explains fallback behavior for unavailable features + - Avoids relying on one agent UI or command set + - Preserves skill intent + case-039: + reference: references/agent-compatibility.md + prompt: Review a skill for assumptions about event streams and UI affordances. + expected_output: A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task. 
+ files: [] + expectations: + - Routes to compatibility guidance + - Removes or isolates runtime-specific assumptions + - Keeps core workflow portable + - Explains fallback behavior for unavailable features + - Avoids relying on one agent UI or command set + - Preserves skill intent + case-040: + reference: references/agent-compatibility.md + prompt: Write compatibility guidance for packaging and validation in another CLI runtime. + expected_output: A response that routes to references/agent-compatibility.md and applies the create-skill workflow for the requested task. + files: [] + expectations: + - Routes to compatibility guidance + - Removes or isolates runtime-specific assumptions + - Keeps core workflow portable + - Explains fallback behavior for unavailable features + - Avoids relying on one agent UI or command set + - Preserves skill intent diff --git a/.agents/skills/create-skill/references/agent-compatibility.md b/.agents/skills/create-skill/references/agent-compatibility.md index 848565c..dec7215 100644 --- a/.agents/skills/create-skill/references/agent-compatibility.md +++ b/.agents/skills/create-skill/references/agent-compatibility.md @@ -45,7 +45,7 @@ Claude Code can use native trigger detection through the optimization scripts: ```bash python -m scripts.run_loop \ - --eval-set <path-to-trigger-eval.json> \ + --eval-set <path-to-trigger-eval.yaml> \ --skill-path <path-to-skill> \ --agent claude-code \ --verbose @@ -61,7 +61,7 @@ Use the generic command adapter unless the agent exposes better trigger telemetr ```bash python -m scripts.run_loop \ - --eval-set <path-to-trigger-eval.json> \ + --eval-set <path-to-trigger-eval.yaml> \ --skill-path <path-to-skill> \ --agent codex-cli \ --verbose @@ -71,7 +71,7 @@ For CLIs that need arguments or files instead of stdin, use `--agent-command`: ```bash python -m scripts.run_loop \ - --eval-set <path-to-trigger-eval.json> \ + --eval-set <path-to-trigger-eval.yaml> \ --skill-path <path-to-skill> \ --agent custom \ --agent-command "agent run --input {prompt_file}" \ diff --git a/.agents/skills/create-skill/references/authoring.md b/.agents/skills/create-skill/references/authoring.md index c463836..edc52ee 100644 ---
a/.agents/skills/create-skill/references/authoring.md +++ b/.agents/skills/create-skill/references/authoring.md @@ -185,7 +185,7 @@ Move long templates, large examples, fixture files, and generated review assets ## Add Evals -For router skills with `references/*.md`, create `evals/evals.json` before validation is considered complete. +For router skills with `references/*.md`, create `evals/evals.yaml` before validation is considered complete. Each eval must include a `reference` field pointing to the routed reference. Every non-schema reference must have 8-10 evals. Near-miss prompts count toward the route they are intended to test. diff --git a/.agents/skills/create-skill/references/description-optimization.md b/.agents/skills/create-skill/references/description-optimization.md index 518445e..9cd9437 100644 --- a/.agents/skills/create-skill/references/description-optimization.md +++ b/.agents/skills/create-skill/references/description-optimization.md @@ -26,12 +26,12 @@ Examples: ```bash python -m scripts.run_eval \ - --eval-set <path-to-skill>/evals/trigger-evals.json \ + --eval-set <path-to-skill>/evals/trigger-evals.yaml \ --skill-path <path-to-skill> \ --agent codex-cli python -m scripts.run_loop \ - --eval-set <path-to-skill>/evals/trigger-evals.json \ + --eval-set <path-to-skill>/evals/trigger-evals.yaml \ --skill-path <path-to-skill> \ --agent my-agent ``` @@ -40,7 +40,7 @@ For CLIs with unusual invocation shapes, pass `--agent-command` with `{prompt}` ```bash python -m scripts.run_loop \ - --eval-set <path-to-skill>/evals/trigger-evals.json \ + --eval-set <path-to-skill>/evals/trigger-evals.yaml \ --skill-path <path-to-skill> \ --agent custom \ --agent-command "agent run --input {prompt_file}" @@ -56,13 +56,13 @@ Create about 20 realistic queries split between should-trigger and should-not-tr Positive cases should cover varied ways users ask for the skill's core capability. Negative cases should be near misses, not obviously irrelevant prompts.
-Save them as: +Save them as YAML: -```json -[ - { "query": "the user prompt", "should_trigger": true }, - { "query": "a near miss", "should_trigger": false } -] +```yaml +- query: the user prompt + should_trigger: true +- query: a near miss + should_trigger: false ``` ### Weak vs strong eval prompts @@ -101,7 +101,7 @@ When possible, show the eval set to the user before running optimization. People Use `assets/eval_review.html` by replacing: -- **Eval data:** replace `__EVAL_DATA_PLACEHOLDER__` with the JSON array +- **Eval data:** replace `__EVAL_DATA_PLACEHOLDER__` with the eval array as JavaScript data - **Skill name:** replace `__SKILL_NAME_PLACEHOLDER__` with the skill name - **Current description:** replace `__SKILL_DESCRIPTION_PLACEHOLDER__` with the current description @@ -115,7 +115,7 @@ Run: ```bash python -m scripts.run_loop \ - --eval-set <path-to-trigger-eval.json> \ + --eval-set <path-to-trigger-eval.yaml> \ --skill-path <path-to-skill> \ --agent <agent-name> \ --results-dir <path-to-skill>/evals/description-optimization \ diff --git a/.agents/skills/create-skill/references/evaluation.md b/.agents/skills/create-skill/references/evaluation.md index e8102b5..f03735e 100644 --- a/.agents/skills/create-skill/references/evaluation.md +++ b/.agents/skills/create-skill/references/evaluation.md @@ -6,7 +6,7 @@ Use this reference when creating eval cases, running skill iterations, benchmark Create 8-10 realistic prompts for a focused skill. For router skills, create 8-10 prompts per route and include near-miss prompts that could be confused with another route. -Save test cases to `<skill-name>/evals/evals.json`. Keep evals inside the skill folder so prompts, fixtures, outputs, and benchmark history travel with the skill. +Save test cases to `<skill-name>/evals/evals.yaml`. Keep evals inside the skill folder so prompts, fixtures, outputs, and benchmark history travel with the skill. For router skills, add a `reference` field to every eval using the exact relative path, such as `references/postgres.md`. Every reference file that the router can load must have 8-10 evals.
Near-miss prompts still count toward the route they are intended to test. @@ -16,7 +16,7 @@ Start with prompt-level expectations. Add objective assertions after the test se ## Assertion Design -Read this when drafting assertions for a skill's `evals/evals.json`. It answers one question: **what makes an assertion actually useful?** +Read this when drafting assertions for a skill's `evals/evals.yaml`. It answers one question: **what makes an assertion actually useful?** ### Start with the contract, not the test cases diff --git a/.agents/skills/create-skill/references/schemas.md b/.agents/skills/create-skill/references/schemas.md index 1be7126..eaae66c 100644 --- a/.agents/skills/create-skill/references/schemas.md +++ b/.agents/skills/create-skill/references/schemas.md @@ -1,6 +1,6 @@ -# JSON Schemas +# Data Schemas -This document defines the JSON schemas used by skill-creator. +This document defines the YAML and JSON schemas used by skill-creator. --- @@ -31,35 +31,38 @@ metadata: --- -## evals.json +## evals.yaml -Defines the evals for a skill. Located at `<skill-name>/evals/evals.json` within the skill directory. Do not create `evals/` as a sibling of the skill. +Defines the evals for a skill. Located at `<skill-name>/evals/evals.yaml` within the skill directory. Do not create `evals/` as a sibling of the skill. -```json -{ - "skill_name": "example-skill", - "evals": [ - { - "id": 1, - "reference": "references/example-route.md", - "prompt": "User's example prompt", - "expected_output": "Description of expected result", - "files": ["evals/files/sample1.pdf"], - "expectations": ["The output includes X", "The skill used script Y"] - } - ] -} +```yaml +name: example-skill evals +suites: + example-route: + description: Eval cases routed to references/example-route.md.
+ cases: + basic-routing: + reference: references/example-route.md + prompt: "User's example prompt" + expected_output: Description of expected result + files: + - evals/files/sample1.pdf + expectations: + - The output includes X + - The skill used script Y ``` **Fields:** -- `skill_name`: Name matching the skill's frontmatter -- `evals[].id`: Unique integer identifier -- `evals[].reference`: Required for router skills; exact relative path to the routed reference, such as `references/postgres.md` -- `evals[].prompt`: The task to execute -- `evals[].expected_output`: Human-readable description of success -- `evals[].files`: Optional list of input file paths (relative to skill root) -- `evals[].expectations`: List of verifiable statements +- `name`: Human-readable eval set name +- `suites`: Mapping of suite names to related eval cases +- `suites.<suite>.description`: Human-readable suite purpose +- `suites.<suite>.cases`: Mapping of stable case IDs to eval case objects +- `cases.<case-id>.reference`: Required for router skills; exact relative path to the routed reference, such as `references/postgres.md` +- `cases.<case-id>.prompt`: The task to execute +- `cases.<case-id>.expected_output`: Human-readable description of success +- `cases.<case-id>.files`: Optional list of input file paths (relative to skill root) +- `cases.<case-id>.expectations`: List of verifiable statements --- diff --git a/.agents/skills/create-skill/scripts/quick_validate.py b/.agents/skills/create-skill/scripts/quick_validate.py index 058d2fc..9448026 100755 --- a/.agents/skills/create-skill/scripts/quick_validate.py +++ b/.agents/skills/create-skill/scripts/quick_validate.py @@ -6,7 +6,6 @@ import sys import os import re -import json import yaml from pathlib import Path @@ -67,22 +66,36 @@ def validate_markdown_style(skill_path): def validate_eval_coverage(skill_path): """Validate eval coverage for focused and router skills.""" - evals_path = skill_path / 'evals' / 'evals.json' + evals_path = skill_path / 'evals' / 'evals.yaml' references_dir =
skill_path / 'references' if not evals_path.exists(): if references_dir.exists() and any(references_dir.glob('*.md')): - return False, "Router skills with references must include evals/evals.json" + return False, "Router skills with references must include evals/evals.yaml" return True, None try: - data = json.loads(evals_path.read_text()) - except json.JSONDecodeError as e: - return False, f"Invalid JSON in {evals_path}: {e}" - - evals = data.get('evals') - if not isinstance(evals, list): - return False, f"{evals_path}: missing list field 'evals'" + data = yaml.safe_load(evals_path.read_text()) + except yaml.YAMLError as e: + return False, f"Invalid YAML in {evals_path}: {e}" + if not isinstance(data, dict): + return False, f"{evals_path}: expected a YAML mapping" + + suites = data.get('suites') + if not isinstance(suites, dict): + return False, f"{evals_path}: missing mapping field 'suites'" + + evals = [] + for suite_name, suite in suites.items(): + if not isinstance(suite, dict): + return False, f"{evals_path}: suite '{suite_name}' must be an object" + cases = suite.get('cases') + if not isinstance(cases, dict): + return False, f"{evals_path}: suite '{suite_name}' missing mapping field 'cases'" + for case_id, item in cases.items(): + if not isinstance(item, dict): + return False, f"{evals_path}: case '{suite_name}.{case_id}' must be an object" + evals.append((f"{suite_name}.{case_id}", item)) if len(evals) < 2: return False, f"{evals_path}: expected at least 2 evals, found {len(evals)}" @@ -102,15 +115,13 @@ def validate_eval_coverage(skill_path): missing_reference = [] unknown_references = {} - for index, item in enumerate(evals, start=1): - if not isinstance(item, dict): - return False, f"{evals_path}: eval {index} must be an object" + for case_id, item in evals: reference = item.get('reference') if not reference: - missing_reference.append(str(item.get('id', index))) + missing_reference.append(case_id) continue if reference not in counts: - 
unknown_references[str(item.get('id', index))] = reference + unknown_references[case_id] = reference continue counts[reference] += 1 diff --git a/.agents/skills/create-skill/scripts/run_eval.py b/.agents/skills/create-skill/scripts/run_eval.py index 798dcb4..0ccdb04 100755 --- a/.agents/skills/create-skill/scripts/run_eval.py +++ b/.agents/skills/create-skill/scripts/run_eval.py @@ -16,11 +16,39 @@ import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path +import yaml from scripts.agent_runner import AGENT_COMMAND_PRESETS, resolve_agent_command, run_agent_command from scripts.utils import parse_skill_md +def load_eval_set(path: Path) -> list[dict]: + """Load a YAML eval set, flattening suites/cases when present.""" + data = yaml.safe_load(path.read_text()) + if isinstance(data, list): + return data + if not isinstance(data, dict): + raise ValueError(f"{path}: expected a YAML list or mapping") + suites = data.get("suites") + if suites is None: + evals = data.get("evals") + if isinstance(evals, list): + return evals + raise ValueError(f"{path}: missing 'suites' mapping") + if not isinstance(suites, dict): + raise ValueError(f"{path}: 'suites' must be a mapping") + + evals = [] + for suite_name, suite in suites.items(): + if not isinstance(suite, dict) or not isinstance(suite.get("cases"), dict): + raise ValueError(f"{path}: suite '{suite_name}' must contain a cases mapping") + for case_id, case in suite["cases"].items(): + if not isinstance(case, dict): + raise ValueError(f"{path}: case '{suite_name}.{case_id}' must be a mapping") + evals.append({"id": f"{suite_name}.{case_id}", **case}) + return evals + + def find_project_root() -> Path: """Find the project root by walking up from cwd looking for .claude/. 
@@ -339,7 +367,7 @@ def run_eval( def main(): parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") - parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--eval-set", required=True, help="Path to eval set YAML file") parser.add_argument("--skill-path", required=True, help="Path to skill directory") parser.add_argument("--description", default=None, help="Override description to test") parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") @@ -366,7 +394,7 @@ def main(): parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") args = parser.parse_args() - eval_set = json.loads(Path(args.eval_set).read_text()) + eval_set = load_eval_set(Path(args.eval_set)) skill_path = Path(args.skill_path) if not (skill_path / "SKILL.md").exists(): diff --git a/.agents/skills/create-skill/scripts/run_loop.py b/.agents/skills/create-skill/scripts/run_loop.py index 7f3df92..8757db6 100755 --- a/.agents/skills/create-skill/scripts/run_loop.py +++ b/.agents/skills/create-skill/scripts/run_loop.py @@ -18,7 +18,7 @@ from scripts.agent_runner import AGENT_COMMAND_PRESETS from scripts.generate_report import generate_html from scripts.improve_description import improve_description -from scripts.run_eval import find_project_root, run_eval +from scripts.run_eval import find_project_root, load_eval_set, run_eval from scripts.utils import parse_skill_md @@ -249,7 +249,7 @@ def print_eval_stats(label, results, elapsed): def main(): parser = argparse.ArgumentParser(description="Run eval + improve loop") - parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--eval-set", required=True, help="Path to eval set YAML file") parser.add_argument("--skill-path", required=True, help="Path to skill directory") parser.add_argument("--description", default=None, help="Override starting 
description") parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") @@ -280,7 +280,7 @@ def main(): parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") args = parser.parse_args() - eval_set = json.loads(Path(args.eval_set).read_text()) + eval_set = load_eval_set(Path(args.eval_set)) skill_path = Path(args.skill_path) if not (skill_path / "SKILL.md").exists(): diff --git a/.gitignore b/.gitignore index 494b2e2..238e954 100644 --- a/.gitignore +++ b/.gitignore @@ -497,4 +497,4 @@ $RECYCLE.BIN/ # Evaluation output quality files .agents/**/evals/* -!.agents/**/evals/evals.json \ No newline at end of file +!.agents/**/evals/evals.yaml diff --git a/docs/2026-05-02-team-roles-as-skills/SPEC.md b/docs/2026-05-02-team-roles-as-skills/SPEC.md index 93b24f6..71fc4cc 100644 --- a/docs/2026-05-02-team-roles-as-skills/SPEC.md +++ b/docs/2026-05-02-team-roles-as-skills/SPEC.md @@ -142,7 +142,7 @@ Runtime plugin hosting, remote skill fetching, marketplace behavior, live Jira/G - `SKILL.md` exists and contains valid YAML frontmatter with `name` and `description`. - `description` states when to trigger the skill and what artifact it produces. -- `evals/evals.json` exists for release-ready skills. +- `evals/evals.yaml` exists for release-ready skills. - `references/` is used only for substantial reusable guidance loaded on demand. - `SKILL.md` stays under 500 lines. @@ -242,7 +242,7 @@ The verb identifies the artifact or action family. The subject identifies the do ├── / │ ├── SKILL.md │ ├── evals/ -│ │ └── evals.json +│ │ └── evals.yaml │ ├── references/ │ │ └── .md │ ├── scripts/ @@ -439,7 +439,7 @@ Each skill should be built with `create-skill` using this sequence: 1. Clarify trigger scope, expected output, routing needs, and eval expectations. 2. 
Draft `SKILL.md` with frontmatter, concise workflow instructions, clear section headings, and scan anchors. 3. Move reusable detail into `references/` only when needed. -4. Add `evals/evals.json` with required focused or routed coverage. +4. Add `evals/evals.yaml` with required focused or routed coverage. 5. Run `quick_validate.py`. 6. Run or review eval iterations when behavior needs evidence. 7. Review outputs qualitatively and assertions quantitatively where objective checks apply. diff --git a/docs/2026-05-20-general-agent-skills/SPEC.md b/docs/2026-05-20-general-agent-skills/SPEC.md index 37d0b05..5a11e5f 100644 --- a/docs/2026-05-20-general-agent-skills/SPEC.md +++ b/docs/2026-05-20-general-agent-skills/SPEC.md @@ -82,7 +82,7 @@ Each skill folder shall include: .agents/skills// ├── SKILL.md └── evals/ - └── evals.json + └── evals.yaml ``` Evaluation run results, when generated, shall be stored under `evals/iterations/iteration-N/` according to the `create-skill` workflow. A `references/` folder may be added only when it contains useful supporting files, such as examples, detailed procedures, or compatibility notes that would make `SKILL.md` too long or less readable. Do not create placeholder `references/` folders. @@ -259,7 +259,7 @@ Each skill body shall define purpose, scope, trigger cases, non-trigger cases, w **Acceptance criteria:** -- [ ] Each skill has `evals/evals.json` generated through `.agents/skills/create-skill/`. +- [ ] Each skill has `evals/evals.yaml` generated through `.agents/skills/create-skill/`. - [ ] Each skill has 8-10 realistic eval prompts where possible, and never fewer than the PRD minimum of 7. - [ ] Each eval set includes at least 3 true-positive prompts. - [ ] Each eval set includes at least 2 false-positive prompts where nearby language should route elsewhere or not trigger. 
@@ -318,7 +318,7 @@ flowchart TD | Component | Responsibility | | --- | --- | | `.agents/skills//SKILL.md` | Runtime instructions, metadata, trigger guidance, exclusions, workflow, and output expectations | -| `.agents/skills//evals/evals.json` | Representative trigger and non-trigger prompts generated through `create-skill` | +| `.agents/skills//evals/evals.yaml` | Representative trigger and non-trigger prompts generated through `create-skill` | | `.agents/skills//evals/iterations/iteration-N/` | Reproducible eval run outputs, grading, and benchmark artifacts when generated | | `.agents/memory/` | Target memory location for `remember` behavior | | `.agents/skills/create-skill/` | Development-time eval generation, validation, and packaging support | @@ -381,7 +381,7 @@ Review every `SKILL.md` for frontmatter completeness, trigger specificity, exclu ### 8.2 Trigger Eval Review -For each skill, generate and review `evals/evals.json` through `.agents/skills/create-skill/`. Use 8-10 realistic prompts where possible, and never fewer than the PRD minimum of 7: +For each skill, generate and review `evals/evals.yaml` through `.agents/skills/create-skill/`. 
Use 8-10 realistic prompts where possible, and never fewer than the PRD minimum of 7: ```text 3 true-positive prompts diff --git a/docs/2026-05-20-general-agent-skills/user-stories/US-001-author-standalone-general-skills.md b/docs/2026-05-20-general-agent-skills/user-stories/US-001-author-standalone-general-skills.md index 8a86da5..0b56d51 100644 --- a/docs/2026-05-20-general-agent-skills/user-stories/US-001-author-standalone-general-skills.md +++ b/docs/2026-05-20-general-agent-skills/user-stories/US-001-author-standalone-general-skills.md @@ -38,7 +38,7 @@ Source documents: .agents/skills// ├── SKILL.md └── evals/ - └── evals.json + └── evals.yaml ``` --- diff --git a/docs/2026-05-20-general-agent-skills/user-stories/US-002-generate-skill-evals.md b/docs/2026-05-20-general-agent-skills/user-stories/US-002-generate-skill-evals.md index ec0cade..645f523 100644 --- a/docs/2026-05-20-general-agent-skills/user-stories/US-002-generate-skill-evals.md +++ b/docs/2026-05-20-general-agent-skills/user-stories/US-002-generate-skill-evals.md @@ -21,7 +21,7 @@ Source documents: ## 🔍 2. Strict Constraints & Scope Boundaries - **In-Scope:** - - Generate `.agents/skills//evals/evals.json` for each new skill. + - Generate `.agents/skills//evals/evals.yaml` for each new skill. - Include 8-10 realistic prompts where possible, never fewer than 7. - Include at least 3 true-positive prompts, 2 false-positive prompts, and 2 non-trigger prompts per skill. - Include expected trigger behavior and expected output behavior for each eval. @@ -32,7 +32,7 @@ Source documents: - Do not create eval iteration output folders unless eval runs are actually executed. - **Data Models & Schemas:** - Use the eval schema expected by `.agents/skills/create-skill/`. - - Store eval cases at `.agents/skills//evals/evals.json`. + - Store eval cases at `.agents/skills//evals/evals.yaml`. - Store run outputs only under `.agents/skills//evals/iterations/iteration-N/` if runs are performed. 
--- @@ -45,7 +45,7 @@ Source documents: Scenario: Generate evals for each skill Given the nine new general skills exist When the agent generates evals through create-skill conventions - Then each new skill has evals/evals.json + Then each new skill has evals/evals.yaml And each eval file contains 8-10 realistic prompts where possible And no eval file contains fewer than 7 prompts @@ -69,15 +69,15 @@ Scenario: Preserve eval folder discipline *Note to Agent: You are restricted to modifying or analyzing the following components.* - **Primary Target Files:** - 1. `.agents/skills/ask/evals/evals.json` -> Trigger and output evals. - 2. `.agents/skills/brainstorm/evals/evals.json` -> Trigger and output evals. - 3. `.agents/skills/classify/evals/evals.json` -> Trigger and output evals. - 4. `.agents/skills/plan/evals/evals.json` -> Trigger and output evals. - 5. `.agents/skills/investigate/evals/evals.json` -> Trigger and output evals. - 6. `.agents/skills/choose/evals/evals.json` -> Trigger and output evals. - 7. `.agents/skills/manage/evals/evals.json` -> Trigger and output evals. - 8. `.agents/skills/remember/evals/evals.json` -> Trigger and output evals. - 9. `.agents/skills/adapt/evals/evals.json` -> Trigger and output evals. + 1. `.agents/skills/ask/evals/evals.yaml` -> Trigger and output evals. + 2. `.agents/skills/brainstorm/evals/evals.yaml` -> Trigger and output evals. + 3. `.agents/skills/classify/evals/evals.yaml` -> Trigger and output evals. + 4. `.agents/skills/plan/evals/evals.yaml` -> Trigger and output evals. + 5. `.agents/skills/investigate/evals/evals.yaml` -> Trigger and output evals. + 6. `.agents/skills/choose/evals/evals.yaml` -> Trigger and output evals. + 7. `.agents/skills/manage/evals/evals.yaml` -> Trigger and output evals. + 8. `.agents/skills/remember/evals/evals.yaml` -> Trigger and output evals. + 9. `.agents/skills/adapt/evals/evals.yaml` -> Trigger and output evals. 
- **Shared Dependencies/Imports:** - Follow `.agents/skills/create-skill/references/evaluation.md`. - Use boundary distinctions from [SPEC.md](../SPEC.md). @@ -92,7 +92,7 @@ Scenario: Preserve eval folder discipline 2. **Generate Eval Cases:** Create prompt-level evals for each new skill. 3. **Check Counts:** Verify each eval file meets the 8-10 target where possible and never drops below 7. 4. **Check Boundary Coverage:** Confirm likely overlaps are represented across the relevant eval files. -5. **Validate JSON:** Ensure every `evals.json` file is valid JSON and follows the local create-skill expectations. +5. **Validate YAML:** Ensure every `evals.yaml` file is valid YAML and follows the local create-skill expectations. --- @@ -100,7 +100,7 @@ Scenario: Preserve eval folder discipline *Note to Agent: You must run validation scripts to check these boxes before marked as complete.* -- [ ] **Compilation:** All new `evals/evals.json` files parse as valid JSON. +- [ ] **Compilation:** All new `evals/evals.yaml` files parse as valid YAML. - [ ] **Test Coverage:** Every new skill has enough prompt coverage for trigger review. - [ ] **No Regression:** No existing eval files are removed or moved. - [ ] **Idempotency:** Re-running eval generation updates intended files without duplicating cases or creating empty iteration folders. 
diff --git a/docs/2026-05-20-general-agent-skills/user-stories/US-003-validate-skills-and-update-index.md b/docs/2026-05-20-general-agent-skills/user-stories/US-003-validate-skills-and-update-index.md index 45d16ea..45bbe1a 100644 --- a/docs/2026-05-20-general-agent-skills/user-stories/US-003-validate-skills-and-update-index.md +++ b/docs/2026-05-20-general-agent-skills/user-stories/US-003-validate-skills-and-update-index.md @@ -42,7 +42,7 @@ Source documents: ```gherkin Scenario: Validate each new skill - Given a new skill folder exists with SKILL.md and evals/evals.json + Given a new skill folder exists with SKILL.md and evals/evals.yaml When create-skill validation is run against the skill folder Then validation passes And any failures are fixed or documented with a clear reason @@ -68,7 +68,7 @@ Scenario: Prevent runtime coupling - **Primary Target Files:** 1. `.agents/skills//SKILL.md` -> Validation target. - 2. `.agents/skills//evals/evals.json` -> Validation target. + 2. `.agents/skills//evals/evals.yaml` -> Validation target. 3. `.agents/skills/README.md` -> Skill index, if present. - **Shared Dependencies/Imports:** - Use `.agents/skills/create-skill/scripts/quick_validate.py` when available. 
From 4ef23306e03493299ed9a3d8f0a66a8d89f18699 Mon Sep 17 00:00:00 2001 From: Oleg Shulyakov Date: Mon, 25 May 2026 16:25:48 +0300 Subject: [PATCH 2/3] fix --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 238e954..33504fe 100644 --- a/.gitignore +++ b/.gitignore @@ -497,4 +497,5 @@ $RECYCLE.BIN/ # Evaluation output quality files .agents/**/evals/* +!.agents/**/evals/evals.json !.agents/**/evals/evals.yaml From 530b84379cc445806da1f7e97320770ceb8a6011 Mon Sep 17 00:00:00 2001 From: Oleg Shulyakov Date: Mon, 25 May 2026 16:27:33 +0300 Subject: [PATCH 3/3] chore: rename quick_validate --- .agents/skills/README.md | 2 +- .agents/skills/create-skill/SKILL.md | 4 ++-- .agents/skills/create-skill/references/authoring.md | 2 +- .../scripts/{quick_validate.py => validate.py} | 2 +- docs/2026-05-02-team-roles-as-skills/PRD.md | 6 +++--- docs/2026-05-02-team-roles-as-skills/SPEC.md | 12 ++++++------ .../US-003-validate-skills-and-update-index.md | 2 +- docs/2026-05-23-design-principles/SPEC.md | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) rename .agents/skills/create-skill/scripts/{quick_validate.py => validate.py} (99%) diff --git a/.agents/skills/README.md b/.agents/skills/README.md index d31d0ab..e5d3cd1 100644 --- a/.agents/skills/README.md +++ b/.agents/skills/README.md @@ -55,7 +55,7 @@ python3 -m scripts.package_skill ../code-database /tmp/skills-dist Use this validation command when changing an existing skill: ```bash -python3 .agents/skills/create-skill/scripts/quick_validate.py .agents/skills/code-database +python3 .agents/skills/create-skill/scripts/validate.py .agents/skills/code-database ``` The key rule is simple: keep `SKILL.md` and any files it references together. If a skill says to read `references/postgres.md`, that file must remain available relative to the skill folder. Tiny rule, large consequences. Filesystems enjoy pettiness. 
diff --git a/.agents/skills/create-skill/SKILL.md b/.agents/skills/create-skill/SKILL.md index c1e4493..f3bef4a 100644 --- a/.agents/skills/create-skill/SKILL.md +++ b/.agents/skills/create-skill/SKILL.md @@ -37,7 +37,7 @@ Create new skills, review and improve existing skills, evaluate outputs, optimiz 2. **Clarify activation and behavior.** Identify what the skill should do, which user phrases or contexts should trigger it, what output it should produce, and whether objective evals are useful. 3. **Write or revise the skill.** Name new skills using the `-[-]` convention or a concise `` format (e.g., `code-tests`, `ask`). Follow `references/authoring.md` for metadata, trigger descriptions, `SKILL.md` body format, reference file format, section delimiters, scan anchors, examples, helper scripts, portability, and validation. Always bump `metadata.version` using semantic versioning upon any material change to a skill's files. -4. **Test behavior.** Run this skill's `scripts/quick_validate.py` against the target skill when available. For router skills, confirm every `references/*.md` file has 8-10 evals mapped by `reference`; for objectively testable skills, run skill-enabled outputs against a meaningful baseline. +4. **Test behavior.** Run this skill's `scripts/validate.py` against the target skill when available. For router skills, confirm every `references/*.md` file has 8-10 evals mapped by `reference`; for objectively testable skills, run skill-enabled outputs against a meaningful baseline. 5. **Show evidence.** Share validation output, eval results, benchmark summaries, and relevant diffs before making another revision. 6. **Iterate deliberately.** Continue until feedback is resolved or further changes stop improving behavior. 7. **Package last.** Package the final skill only after the user is satisfied with behavior and trigger accuracy. 
@@ -65,7 +65,7 @@ Create new skills, review and improve existing skills, evaluate outputs, optimiz ## Bundled Resources - **Trigger optimization**: `scripts/run_eval.py`, `scripts/run_loop.py`, and `scripts/improve_description.py` -- **Validation**: `scripts/quick_validate.py` +- **Validation**: `scripts/validate.py` - **Benchmark summaries**: `scripts/aggregate_benchmark.py` - **Packaging**: `scripts/package_skill.py` - **Human review UI**: `eval-viewer/generate_review.py` diff --git a/.agents/skills/create-skill/references/authoring.md b/.agents/skills/create-skill/references/authoring.md index edc52ee..14f1171 100644 --- a/.agents/skills/create-skill/references/authoring.md +++ b/.agents/skills/create-skill/references/authoring.md @@ -198,7 +198,7 @@ For objectively testable skills, include assertions, scripts, schemas, fixtures, After editing, run: ```bash -python /scripts/quick_validate.py +python /scripts/validate.py ``` Treat style failures as authoring bugs, not optional polish. 
diff --git a/.agents/skills/create-skill/scripts/quick_validate.py b/.agents/skills/create-skill/scripts/validate.py similarity index 99% rename from .agents/skills/create-skill/scripts/quick_validate.py rename to .agents/skills/create-skill/scripts/validate.py index 9448026..0710a9f 100755 --- a/.agents/skills/create-skill/scripts/quick_validate.py +++ b/.agents/skills/create-skill/scripts/validate.py @@ -330,7 +330,7 @@ def validate_skill(skill_path): if __name__ == "__main__": if len(sys.argv) != 2: - print("Usage: python quick_validate.py ") + print("Usage: python validate.py ") sys.exit(1) valid, message = validate_skill(sys.argv[1]) diff --git a/docs/2026-05-02-team-roles-as-skills/PRD.md b/docs/2026-05-02-team-roles-as-skills/PRD.md index 3e82198..0e39c02 100644 --- a/docs/2026-05-02-team-roles-as-skills/PRD.md +++ b/docs/2026-05-02-team-roles-as-skills/PRD.md @@ -41,7 +41,7 @@ The initiative targets teams and individual practitioners who want role-aware AI | G-3 | Reduce repeated role-specific prompting. | Users can invoke each completed skill by artifact or task intent without restating its output structure or role conventions. | | G-4 | Produce concrete, reusable artifacts instead of generic advice. | Every completed skill description names the artifact it produces and when it should trigger. | | G-5 | Keep complex domains usable without exploding the skill count. | Multi-variant router skills select the correct reference from context or ask only when materially ambiguous. | -| G-6 | Maintain quality through repeatable evaluation. | Each release-ready skill passes `quick_validate.py`, has required eval coverage, and clears an 85% aggregate expectation pass rate with no failed critical expectations. | +| G-6 | Maintain quality through repeatable evaluation. | Each release-ready skill passes `validate.py`, has required eval coverage, and clears an 85% aggregate expectation pass rate with no failed critical expectations. 
| | G-7 | Make completed skills distributable from local artifacts. | Each release-ready skill packages successfully as a `.skill` file with bundled instructions, references, scripts, and assets as applicable. | --- @@ -145,7 +145,7 @@ For multi-variant skills, the expected interaction is context-first routing. For | R-3 | Eval requirements slow down early skill creation. | MEDIUM | Build P1 skills first and treat evals as part of the definition of done, not cleanup. | OPEN | | R-4 | Team-specific conventions may not fit the base library. | MEDIUM | Keep base skills generic, then support local install-time or repository-level guidance. | OPEN | | R-5 | Documentation can drift from the actual skill folders. | HIGH | Update PRD, SPEC, TASKS, and memory notes in the same change when catalog decisions change. | OPEN | -| R-6 | Some implemented skills predate the current `create-skill` validation rules. | HIGH | Run `quick_validate.py` per skill, then fix missing bold scan anchors, routed eval `reference` fields, and reference-section principles before release readiness. | OPEN | +| R-6 | Some implemented skills predate the current `create-skill` validation rules. | HIGH | Run `validate.py` per skill, then fix missing bold scan anchors, routed eval `reference` fields, and reference-section principles before release readiness. | OPEN | --- @@ -165,7 +165,7 @@ For multi-variant skills, the expected interaction is context-first routing. For | --- | --- | --- | --- | --- | | DEC-1 | Use milestone gates instead of a fixed calendar date for the first complete catalog release. | The release is ready when all 55 cataloged skills are implemented, evaluated, documented, and packageable; 10 catalog skills are implemented as of 2026-05-23. | Oleg Shulyakov [assumed] | 2026-05-23 | | DEC-2 | Use the `create-skill` eval bar: 8-10 realistic eval prompts for focused skills and 8-10 prompts per routed reference for router skills. 
| This keeps the PRD aligned with the maintained authoring workflow while allowing specialized skills to add cases for variant coverage, boundary-trigger testing, or safety-sensitive behavior. | Skill maintainers [assumed] | 2026-05-23 | -| DEC-3 | Package release-ready skills from `.agents/skills/create-skill` with `python3 -m scripts.package_skill ../ /tmp/skills-dist`. | Release readiness requires `quick_validate.py` to pass, evals to be present in source and pass at least an 85% aggregate expectation pass rate with no failed critical expectations, router evals to include `reference` fields, references to be useful, and no security or packaging blockers to remain. | Skill maintainers [assumed] | 2026-05-23 | +| DEC-3 | Package release-ready skills from `.agents/skills/create-skill` with `python3 -m scripts.package_skill ../ /tmp/skills-dist`. | Release readiness requires `validate.py` to pass, evals to be present in source and pass at least an 85% aggregate expectation pass rate with no failed critical expectations, router evals to include `reference` fields, references to be useful, and no security or packaging blockers to remain. | Skill maintainers [assumed] | 2026-05-23 | | DEC-4 | Treat organization-level convention packs as a separate follow-up. | This initiative ships the base local skill library first; organization convention packs should layer on later once the base format and release checks are stable. | Oleg Shulyakov [assumed] | 2026-05-23 | --- diff --git a/docs/2026-05-02-team-roles-as-skills/SPEC.md b/docs/2026-05-02-team-roles-as-skills/SPEC.md index 71fc4cc..65d95c4 100644 --- a/docs/2026-05-02-team-roles-as-skills/SPEC.md +++ b/docs/2026-05-02-team-roles-as-skills/SPEC.md @@ -61,7 +61,7 @@ The library must remain local-first. It is not a plugin marketplace, a project m | Keep discovery predictable | Skill names follow verb-first convention. 
| 100% compliance | | Reduce repeated prompting | Completed skills encode trigger, output, and quality expectations. | Every completed skill has specific frontmatter and instructions | | Keep complex domains usable | Router skills select variants from context. | Ask at most one clarifying question when materially ambiguous | -| Maintain quality | Skills pass validation and eval thresholds. | `quick_validate.py` pass plus required eval coverage | +| Maintain quality | Skills pass validation and eval thresholds. | `validate.py` pass plus required eval coverage | | Support local distribution | Release-ready skills package as `.skill` files. | Successful local package build | ### 1.6 Non-Goals @@ -100,7 +100,7 @@ Runtime plugin hosting, remote skill fetching, marketplace behavior, live Jira/G 1. Maintainer confirms trigger scope, output artifact, routing needs, and eval expectations. 2. Maintainer creates or updates `SKILL.md`, references, and evals. -3. Maintainer validates the skill with `quick_validate.py`. +3. Maintainer validates the skill with `validate.py`. 4. Maintainer runs or reviews eval coverage where available. 5. Maintainer updates PRD, SPEC, TASKS, and memory notes when catalog behavior changes. 6. Maintainer packages the skill once release-ready. @@ -179,7 +179,7 @@ Runtime plugin hosting, remote skill fetching, marketplace behavior, live Jira/G **Acceptance criteria:** -- `python3 .agents/skills/create-skill/scripts/quick_validate.py .agents/skills/` passes. +- `python3 .agents/skills/create-skill/scripts/validate.py .agents/skills/` passes. - Focused skills have 8-10 realistic eval prompts. - Router skills have 8-10 eval prompts per routed reference before release readiness. - Eval assertions reach at least 85% aggregate pass rate with no failed critical expectations. 
@@ -398,7 +398,7 @@ Router skills for this release are `audit-security`, `code-frontend`, `code-back Each release-ready skill must pass: ```bash -python3 .agents/skills/create-skill/scripts/quick_validate.py .agents/skills/ +python3 .agents/skills/create-skill/scripts/validate.py .agents/skills/ ``` Validation failures block packaging. Common blockers include invalid frontmatter, missing required sections, weak scan anchors, malformed evals, missing router `reference` fields, and unused placeholder folders. @@ -440,7 +440,7 @@ Each skill should be built with `create-skill` using this sequence: 2. Draft `SKILL.md` with frontmatter, concise workflow instructions, clear section headings, and scan anchors. 3. Move reusable detail into `references/` only when needed. 4. Add `evals/evals.yaml` with required focused or routed coverage. -5. Run `quick_validate.py`. +5. Run `validate.py`. 6. Run or review eval iterations when behavior needs evidence. 7. Review outputs qualitatively and assertions quantitatively where objective checks apply. 8. Iterate until feedback is resolved, improvements flatten, or the user accepts behavior. @@ -500,7 +500,7 @@ python3 -m scripts.package_skill ../ /tmp/skills-dist | # | Question | Answer | Owner | Status | | --- | --- | --- | --- | --- | -| 1 | What assertion pass threshold is required for eval release readiness? | Release-ready skills must reach at least 85% aggregate expectation pass rate, with no failed critical expectations, after `quick_validate.py` passes and required eval coverage exists. Human review may require fixes above that threshold when failures affect the skill's core artifact. | Skill maintainers [assumed] | Closed | +| 1 | What assertion pass threshold is required for eval release readiness? | Release-ready skills must reach at least 85% aggregate expectation pass rate, with no failed critical expectations, after `validate.py` passes and required eval coverage exists. 
Human review may require fixes above that threshold when failures affect the skill's core artifact. | Skill maintainers [assumed] | Closed | | 2 | What local install command or workflow confirms packaged `.skill` artifacts are installable? | No separate local install command exists in the repo today. Current release verification is package-and-inspect: run `python3 -m scripts.package_skill ../ /tmp/skills-dist`, confirm exit code 0, then list the `.skill` archive and verify `/SKILL.md` plus required references, scripts, and assets are present. | Oleg Shulyakov [assumed] | Closed | ## 14. Appendix diff --git a/docs/2026-05-20-general-agent-skills/user-stories/US-003-validate-skills-and-update-index.md b/docs/2026-05-20-general-agent-skills/user-stories/US-003-validate-skills-and-update-index.md index 45bbe1a..dc16f06 100644 --- a/docs/2026-05-20-general-agent-skills/user-stories/US-003-validate-skills-and-update-index.md +++ b/docs/2026-05-20-general-agent-skills/user-stories/US-003-validate-skills-and-update-index.md @@ -71,7 +71,7 @@ Scenario: Prevent runtime coupling 2. `.agents/skills/<skill-name>/evals/evals.yaml` -> Validation target. 3. `.agents/skills/README.md` -> Skill index, if present. - **Shared Dependencies/Imports:** - - Use `.agents/skills/create-skill/scripts/quick_validate.py` when available. + - Use `.agents/skills/create-skill/scripts/validate.py` when available. - Follow `.agents/skills/create-skill/references/authoring.md`. --- diff --git a/docs/2026-05-23-design-principles/SPEC.md b/docs/2026-05-23-design-principles/SPEC.md index 0bb23fa..8d77489 100644 --- a/docs/2026-05-23-design-principles/SPEC.md +++ b/docs/2026-05-23-design-principles/SPEC.md @@ -279,7 +279,7 @@ No production observability changes are required.
Development-time validation sh | Level | Scope | Tools | Coverage Target | | --- | --- | --- | --- | | Markdown lint | Changed Markdown files | `markdownlint` or `markdownlint-cli2` | No lint errors, unless documented | -| Skill validation | Changed skills | `.agents/skills/create-skill/scripts/quick_validate.py` | Pass for every changed skill | +| Skill validation | Changed skills | `.agents/skills/create-skill/scripts/validate.py` | Pass for every changed skill | | Eval prompts | Principle-specific behavior | Existing skill eval workflow | Added only where behavior risk justifies it | | Manual review | Runtime wording | Human review | No duplicated full principle catalog or vague principle boilerplate |