From 47c02a81c0db14374f3431cf412ca628b8b1307c Mon Sep 17 00:00:00 2001 From: "Carlos D. Escobar-Valbuena" Date: Fri, 5 Jun 2026 14:44:37 -0500 Subject: [PATCH] feat(0.26.0): skills audit --require-tests gate (BRO-1411 slice 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skillify step 3 (unit tests on deterministic code), bstack-native. Adds a 6th audit report 'Untested deterministic code' (correctness, vs the 5 hygiene reports) + --require-tests flag that gates CI (exit 1 if any skill ships scripts/*.{py,sh,mjs,js,ts} with no test file). Markdown-only skills exempt. First real run over ~/broomva/skills: 19 skills ship untested deterministic code — the bstack analog of GBrain's 6/40 dark-skills finding. +4 hermetic tests (14/14). VERSION 0.25.0 -> 0.26.0. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 18 +++++++++ VERSION | 2 +- scripts/skill-audit.py | 84 +++++++++++++++++++++++++++++++++++++-- tests/skill-audit.test.sh | 34 ++++++++++++++++ 4 files changed, 134 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b610689..4ef626c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Changelog +## 0.26.0 — 2026-06-05 + +### feat: `bstack skills audit --require-tests` — skill-script test gate (BRO-1411 slice 2) + +Adds a sixth audit report and a CI-gateable correctness check. Origin: `/checkit` on Garry Tan's "skillify" essay surfaced that bstack's `skills audit` covers *hygiene* (budget / duplicate / reachability) but never *correctness* of the skill layer. This is skillify step 3 ("unit tests on the deterministic code"), built bstack-native — the correctness counterpart to the existing five hygiene reports. + +### Added + +- **`scripts/skill-audit.py` report 6 — "Untested deterministic code".** Flags any skill that ships deterministic code (`scripts/*.{py,sh,mjs,js,ts}` or root-level) but no test file (`test_*`, `*_test.py`, `*_test.sh`, `*.test.*`). 
Markdown-only skills are exempt (nothing to test); test files themselves don't count as code. Always shown (informational); `--json` gains an `untested` array. +- **`--require-tests` flag** — escalates the report to a hard gate: exit 1 if any skill ships untested code. Default (no flag) stays exit 0, so the report informs without breaking existing callers; CI opts into enforcement. +- **`tests/skill-audit.test.sh`** — 4 new hermetic cases (T10–T13): untested detection (code-no-tests flagged, code+tests and md-only exempt), gate exit-1 under `--require-tests`, informational exit-0 without it, human-report section presence. 14/14 pass. + +### Notes + +- First real run over `~/broomva/skills`: **19 skills ship deterministic code with no tests** (the bstack analog of GBrain's "6/40 dark skills" finding). Informational today; backlog candidates for test backfill. +- Primitive count unchanged (**20**). Widens the `bstack skills audit` surface — not a new P-row. The `bstack-engine` "Skill-QA discipline" ledger row (workspace KG) tracks promotion eligibility. +- `VERSION` 0.25.0 → 0.26.0. + ## 0.25.0 — 2026-06-05 ### feat: doctor advisory + repair backfill for the Development Philosophy section (BRO-1409) diff --git a/VERSION b/VERSION index d21d277..4e8f395 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.25.0 +0.26.0 diff --git a/scripts/skill-audit.py b/scripts/skill-audit.py index a486f58..dc15715 100755 --- a/scripts/skill-audit.py +++ b/scripts/skill-audit.py @@ -13,13 +13,15 @@ - usage-trace scanning of Claude Code logs (~/.claude/projects/**/*.jsonl) rather than Codex's ~/.codex/history.jsonl -Five reports: +Six reports (1-5 are hygiene; 6 is correctness — skillify step 3, BRO-1411): 1. Budget — total description token cost vs ceiling (default 2% of 1M) 2. Duplicates — same skill name across >1 distinct realpath 3. Registry — coherence between companion-skills.yaml and installed roots (registered-but-missing, installed-but-unregistered) 4. 
Unused — no invocation trace in recent session logs (--months window) 5. Roots — skill count per root + 6. Untested — ships deterministic code (scripts/*.{py,sh,mjs,js,ts}) but no + tests; informational by default, a hard gate under --require-tests Env overrides (test fixtures): BSTACK_DIR bstack root (for default companion-skills.yaml) @@ -152,6 +154,63 @@ def scan_usage(skill_names: list[str], log_glob: str, months: int) -> set[str]: return used +CODE_EXTS = {".py", ".sh", ".mjs", ".js", ".ts"} + + +def _is_test_file(name: str) -> bool: + return ( + name.startswith("test_") + or name.endswith("_test.py") + or name.endswith("_test.sh") + or ".test." in name + ) + + +def _skill_code_files(skill_dir: Path) -> list[str]: + """Deterministic code files a skill ships (scripts/ + skill root, one level). + + Test files are excluded — a skill whose only code IS its tests has nothing + left to test. Markdown-only skills return [] and are exempt from the gate. + """ + found: list[str] = [] + for sub in ("scripts", ""): + d = skill_dir / sub if sub else skill_dir + if not d.is_dir(): + continue + for f in sorted(d.iterdir()): + if f.is_file() and f.suffix in CODE_EXTS and not _is_test_file(f.name): + found.append(str(f.relative_to(skill_dir))) + return found + + +def _skill_has_tests(skill_dir: Path) -> bool: + """True if the skill ships any test file (tests/ or scripts/ or root, one level).""" + for sub in ("tests", "scripts", ""): + d = skill_dir / sub if sub else skill_dir + if not d.is_dir(): + continue + for f in d.iterdir(): + if f.is_file() and _is_test_file(f.name): + return True + return False + + +def detect_untested(skills: list[dict]) -> list[dict]: + """Skills shipping deterministic code but no tests — skillify step 3 (BRO-1411). + + The correctness counterpart to the hygiene reports: `audit` already covers + budget/duplicate/reachability; this covers "the script the skill runs is + actually tested". Markdown-only skills are exempt (no deterministic code). 
+ """ + out: list[dict] = [] + for s in skills: + skill_dir = Path(s["path"]).parent + code = _skill_code_files(skill_dir) + if code and not _skill_has_tests(skill_dir): + out.append({"name": s["name"], "dir": str(skill_dir), "code_files": code}) + return sorted(out, key=lambda x: x["name"]) + + def main() -> int: ap = argparse.ArgumentParser(prog="bstack skills audit", description="Skill registry audit.") ap.add_argument("--roots", action="append", default=[], help="Additional skill root (repeatable).") @@ -159,6 +218,8 @@ def main() -> int: ap.add_argument("--chars-per-token", type=int, default=4, help="Token-cost divisor (default 4).") ap.add_argument("--months", type=int, default=3, help="Usage-trace window for unused detection (default 3).") ap.add_argument("--no-logs", action="store_true", help="Skip usage-trace scanning.") + ap.add_argument("--require-tests", action="store_true", + help="Gate: exit 1 if any skill ships deterministic code without tests (skillify step 3, BRO-1411).") ap.add_argument("--json", action="store_true", help="Machine-readable output.") args = ap.parse_args() @@ -204,6 +265,10 @@ def main() -> int: for s in skills: root_counts[s["root"]] = root_counts.get(s["root"], 0) + 1 + # 6. 
Untested deterministic code (skillify step 3 — correctness, not hygiene) + untested = detect_untested(skills) + gate_failed = bool(args.require_tests and untested) + if args.json: print(json.dumps({ "total_skills": len(skills), @@ -213,8 +278,10 @@ def main() -> int: "registry": {"registered_missing": registered_missing, "installed_unregistered": installed_unregistered}, "unused": unused, "roots": root_counts, + "untested": untested, + "require_tests": bool(args.require_tests), }, indent=2)) - return 0 + return 1 if gate_failed else 0 # Human report print("# Skill Audit Report\n") @@ -243,10 +310,21 @@ def main() -> int: print(f"## Unused (no trace in last {args.months}mo) [{len(unused)}]") print(f" {', '.join(unused) or '(none — all skills show recent usage)'}") print() + print(f"## Untested deterministic code [{len(untested)}]") + if untested: + for u in untested: + print(f" {u['name']}: {', '.join(u['code_files'])}") + if args.require_tests: + print(f" ⚠ {len(untested)} skill(s) ship code without tests — --require-tests gate FAILED") + else: + print(" (informational — pass --require-tests to gate CI on this)") + else: + print(" (none — every skill with deterministic code ships tests)") + print() print("## Roots") for r, c in sorted(root_counts.items()): print(f" {c:3d} {r}") - return 0 + return 1 if gate_failed else 0 if __name__ == "__main__": diff --git a/tests/skill-audit.test.sh b/tests/skill-audit.test.sh index e661c7d..b8d2329 100755 --- a/tests/skill-audit.test.sh +++ b/tests/skill-audit.test.sh @@ -132,6 +132,40 @@ if echo "$out" | grep -q '## Budget' && echo "$out" | grep -q '## Duplicates' \ && echo "$out" | grep -q '## Registry coherence' && echo "$out" | grep -q '## Unused' \ && echo "$out" | grep -q '## Roots'; then ap "$t"; else af "$t"; fi +# ── --require-tests gate (skillify step 3, BRO-1411) — separate hermetic root ── +FX2="$(mktemp -d)"; RT_ROOT="$FX2/skills"; mkdir -p "$RT_ROOT" +# md-only skill → exempt (no deterministic code) +make_skill 
"$RT_ROOT" docs docs "Markdown only, no code." +# code, no tests → must be flagged untested +make_skill "$RT_ROOT" coded coded "Script but no tests." +mkdir -p "$RT_ROOT/coded/scripts"; echo 'print(1)' > "$RT_ROOT/coded/scripts/run.py" +# code + tests → must NOT be flagged +make_skill "$RT_ROOT" tested tested "Script and test." +mkdir -p "$RT_ROOT/tested/scripts" "$RT_ROOT/tested/tests" +echo 'print(1)' > "$RT_ROOT/tested/scripts/run.py" +echo 'def test_x(): assert True' > "$RT_ROOT/tested/tests/test_run.py" + +rt_audit() { BSTACK_AUDIT_ROOTS="$RT_ROOT" BSTACK_DIR="$FAKE_BSTACK" python3 "$AUDIT_PY" "$@"; } + +# T10: untested detection — only 'coded' flagged; 'tested' + md-only 'docs' exempt +t="untested detection (coded flagged; tested + md-only exempt)" +if rt_audit --json --no-logs 2>/dev/null | python3 -c "import json,sys; d=json.load(sys.stdin); u={x['name'] for x in d['untested']}; assert u=={'coded'}, u" 2>/dev/null; then ap "$t"; else af "$t"; fi + +# T11: --require-tests gate exits 1 when an untested skill exists +t="--require-tests exits 1 on untested skill" +rt_audit --no-logs --require-tests >/dev/null 2>&1; rc=$? +if [ "$rc" -eq 1 ]; then ap "$t"; else af "$t" "rc=$rc (expected 1)"; fi + +# T12: without --require-tests, untested report is informational (exit 0) +t="untested report informational without --require-tests (exit 0)" +rt_audit --no-logs >/dev/null 2>&1; rc=$? +if [ "$rc" -eq 0 ]; then ap "$t"; else af "$t" "rc=$rc (expected 0)"; fi + +# T13: human report includes the Untested section +t="human report includes '## Untested deterministic code'" +if rt_audit --no-logs 2>/dev/null | grep -q '## Untested deterministic code'; then ap "$t"; else af "$t"; fi + +rm -rf "$FX2" rm -rf "$FX" echo "" echo "── results: $PASS passed, $FAIL failed ────────────────────────────"