From 47c02a81c0db14374f3431cf412ca628b8b1307c Mon Sep 17 00:00:00 2001
From: "Carlos D. Escobar-Valbuena" <devteam@getstimulus.ai>
Date: Fri, 5 Jun 2026 14:44:37 -0500
Subject: [PATCH] feat(0.26.0): skills audit --require-tests gate (BRO-1411
 slice 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skillify step 3 (unit tests on deterministic code), bstack-native. Adds a 6th
audit report 'Untested deterministic code' (correctness, vs the 5 hygiene
reports) + --require-tests flag that gates CI (exit 1 if any skill ships
scripts/*.{py,sh,mjs,js,ts} with no test file). Markdown-only skills exempt.

First real run over ~/broomva/skills: 19 skills ship untested deterministic
code — the bstack analog of GBrain's 6/40 dark-skills finding.

+4 hermetic tests (14/14). VERSION 0.25.0 -> 0.26.0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md              | 18 +++++++++
 VERSION                   |  2 +-
 scripts/skill-audit.py    | 84 +++++++++++++++++++++++++++++++++++++--
 tests/skill-audit.test.sh | 34 ++++++++++++++++
 4 files changed, 134 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b610689..4ef626c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,23 @@
 # Changelog
 
+## 0.26.0 — 2026-06-05
+
+### feat: `bstack skills audit --require-tests` — skill-script test gate (BRO-1411 slice 2)
+
+Adds a sixth audit report and a CI-gateable correctness check. Origin: `/checkit` on Garry Tan's "skillify" essay surfaced that bstack's `skills audit` covers *hygiene* (budget / duplicate / reachability) but never *correctness* of the skill layer. This is skillify step 3 ("unit tests on the deterministic code"), built bstack-native — the correctness counterpart to the existing five hygiene reports.
+
+### Added
+
+- **`scripts/skill-audit.py` report 6 — "Untested deterministic code".** Flags any skill that ships deterministic code (`*.{py,sh,mjs,js,ts}` directly under `scripts/` or the skill root) but no test file (`test_*`, `*_test.py`, `*_test.sh`, or `*.test.*`, directly under `tests/`, `scripts/`, or the skill root). Markdown-only skills are exempt (nothing to test); test files themselves don't count as code. Always shown (informational); `--json` gains an `untested` array and a `require_tests` boolean.
+- **`--require-tests` flag** — escalates the report to a hard gate: exit 1 if any skill ships untested code. Default (no flag) stays exit 0, so the report informs without breaking existing callers; CI opts into enforcement.
+- **`tests/skill-audit.test.sh`** — 4 new hermetic cases (T10–T13): untested detection (code-no-tests flagged, code+tests and md-only exempt), gate exit-1 under `--require-tests`, informational exit-0 without it, human-report section presence. 14/14 pass.
+
+### Notes
+
+- First real run over `~/broomva/skills`: **19 skills ship deterministic code with no tests** (the bstack analog of GBrain's "6/40 dark skills" finding). Informational today; backlog candidates for test backfill.
+- Primitive count unchanged (**20**). Widens the `bstack skills audit` surface — not a new P-row. The `bstack-engine` "Skill-QA discipline" ledger row (workspace KG) tracks promotion eligibility.
+- `VERSION` 0.25.0 → 0.26.0.
+
 ## 0.25.0 — 2026-06-05
 
 ### feat: doctor advisory + repair backfill for the Development Philosophy section (BRO-1409)
diff --git a/VERSION b/VERSION
index d21d277..4e8f395 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.25.0
+0.26.0
diff --git a/scripts/skill-audit.py b/scripts/skill-audit.py
index a486f58..dc15715 100755
--- a/scripts/skill-audit.py
+++ b/scripts/skill-audit.py
@@ -13,13 +13,15 @@
   - usage-trace scanning of Claude Code logs (~/.claude/projects/**/*.jsonl)
     rather than Codex's ~/.codex/history.jsonl
 
-Five reports:
+Six reports (1-5 are hygiene; 6 is correctness — skillify step 3, BRO-1411):
   1. Budget        — total description token cost vs ceiling (default 2% of 1M)
   2. Duplicates    — same skill name across >1 distinct realpath
   3. Registry      — coherence between companion-skills.yaml and installed roots
                      (registered-but-missing, installed-but-unregistered)
   4. Unused        — no invocation trace in recent session logs (--months window)
   5. Roots         — skill count per root
+  6. Untested      — ships deterministic code (*.{py,sh,mjs,js,ts} in scripts/ or the
+                     skill root) but no tests; informational by default, a hard gate under --require-tests
 
 Env overrides (test fixtures):
   BSTACK_DIR                  bstack root (for default companion-skills.yaml)
@@ -152,6 +154,63 @@ def scan_usage(skill_names: list[str], log_glob: str, months: int) -> set[str]:
     return used
 
 
+CODE_EXTS = {".py", ".sh", ".mjs", ".js", ".ts"}
+
+
+def _is_test_file(name: str) -> bool:
+    return (
+        name.startswith("test_")
+        or name.endswith("_test.py")
+        or name.endswith("_test.sh")
+        or ".test." in name
+    )
+
+
+def _skill_code_files(skill_dir: Path) -> list[str]:
+    """Deterministic code files a skill ships (scripts/ + skill root, one level).
+
+    Test files are excluded — a skill whose only code IS its tests has nothing
+    left to test. Markdown-only skills return [] and are exempt from the gate.
+    """
+    found: list[str] = []
+    for sub in ("scripts", ""):
+        d = skill_dir / sub if sub else skill_dir
+        if not d.is_dir():
+            continue
+        for f in sorted(d.iterdir()):
+            if f.is_file() and f.suffix in CODE_EXTS and not _is_test_file(f.name):
+                found.append(str(f.relative_to(skill_dir)))
+    return found
+
+
+def _skill_has_tests(skill_dir: Path) -> bool:
+    """True if the skill ships any test file (tests/ or scripts/ or root, one level)."""
+    for sub in ("tests", "scripts", ""):
+        d = skill_dir / sub if sub else skill_dir
+        if not d.is_dir():
+            continue
+        for f in d.iterdir():
+            if f.is_file() and _is_test_file(f.name):
+                return True
+    return False
+
+
+def detect_untested(skills: list[dict]) -> list[dict]:
+    """Skills shipping deterministic code but no tests — skillify step 3 (BRO-1411).
+
+    The correctness counterpart to the hygiene reports: `audit` already covers
+    budget/duplicate/reachability; this covers "the script the skill runs is
+    actually tested". Markdown-only skills are exempt (no deterministic code).
+    """
+    out: list[dict] = []
+    for s in skills:
+        skill_dir = Path(s["path"]).parent
+        code = _skill_code_files(skill_dir)
+        if code and not _skill_has_tests(skill_dir):
+            out.append({"name": s["name"], "dir": str(skill_dir), "code_files": code})
+    return sorted(out, key=lambda x: x["name"])
+
+
 def main() -> int:
     ap = argparse.ArgumentParser(prog="bstack skills audit", description="Skill registry audit.")
     ap.add_argument("--roots", action="append", default=[], help="Additional skill root (repeatable).")
@@ -159,6 +218,8 @@ def main() -> int:
     ap.add_argument("--chars-per-token", type=int, default=4, help="Token-cost divisor (default 4).")
     ap.add_argument("--months", type=int, default=3, help="Usage-trace window for unused detection (default 3).")
     ap.add_argument("--no-logs", action="store_true", help="Skip usage-trace scanning.")
+    ap.add_argument("--require-tests", action="store_true",
+                    help="Gate: exit 1 if any skill ships deterministic code without tests (skillify step 3, BRO-1411).")
     ap.add_argument("--json", action="store_true", help="Machine-readable output.")
     args = ap.parse_args()
 
@@ -204,6 +265,10 @@ def main() -> int:
     for s in skills:
         root_counts[s["root"]] = root_counts.get(s["root"], 0) + 1
 
+    # 6. Untested deterministic code (skillify step 3 — correctness, not hygiene)
+    untested = detect_untested(skills)
+    gate_failed = bool(args.require_tests and untested)
+
     if args.json:
         print(json.dumps({
             "total_skills": len(skills),
@@ -213,8 +278,10 @@ def main() -> int:
             "registry": {"registered_missing": registered_missing, "installed_unregistered": installed_unregistered},
             "unused": unused,
             "roots": root_counts,
+            "untested": untested,
+            "require_tests": bool(args.require_tests),
         }, indent=2))
-        return 0
+        return 1 if gate_failed else 0
 
     # Human report
     print("# Skill Audit Report\n")
@@ -243,10 +310,21 @@ def main() -> int:
         print(f"## Unused (no trace in last {args.months}mo)  [{len(unused)}]")
         print(f"  {', '.join(unused) or '(none — all skills show recent usage)'}")
     print()
+    print(f"## Untested deterministic code  [{len(untested)}]")
+    if untested:
+        for u in untested:
+            print(f"  {u['name']}: {', '.join(u['code_files'])}")
+        if args.require_tests:
+            print(f"  ⚠ {len(untested)} skill(s) ship code without tests — --require-tests gate FAILED")
+        else:
+            print("  (informational — pass --require-tests to gate CI on this)")
+    else:
+        print("  (none — every skill with deterministic code ships tests)")
+    print()
     print("## Roots")
     for r, c in sorted(root_counts.items()):
         print(f"  {c:3d}  {r}")
-    return 0
+    return 1 if gate_failed else 0
 
 
 if __name__ == "__main__":
diff --git a/tests/skill-audit.test.sh b/tests/skill-audit.test.sh
index e661c7d..b8d2329 100755
--- a/tests/skill-audit.test.sh
+++ b/tests/skill-audit.test.sh
@@ -132,6 +132,40 @@ if echo "$out" | grep -q '## Budget' && echo "$out" | grep -q '## Duplicates' \
    && echo "$out" | grep -q '## Registry coherence' && echo "$out" | grep -q '## Unused' \
    && echo "$out" | grep -q '## Roots'; then ap "$t"; else af "$t"; fi
 
+# ── --require-tests gate (skillify step 3, BRO-1411) — separate hermetic root ──
+FX2="$(mktemp -d)"; RT_ROOT="$FX2/skills"; mkdir -p "$RT_ROOT"
+# md-only skill → exempt (no deterministic code)
+make_skill "$RT_ROOT" docs docs "Markdown only, no code."
+# code, no tests → must be flagged untested
+make_skill "$RT_ROOT" coded coded "Script but no tests."
+mkdir -p "$RT_ROOT/coded/scripts"; echo 'print(1)' > "$RT_ROOT/coded/scripts/run.py"
+# code + tests → must NOT be flagged
+make_skill "$RT_ROOT" tested tested "Script and test."
+mkdir -p "$RT_ROOT/tested/scripts" "$RT_ROOT/tested/tests"
+echo 'print(1)' > "$RT_ROOT/tested/scripts/run.py"
+echo 'def test_x(): assert True' > "$RT_ROOT/tested/tests/test_run.py"
+
+rt_audit() { BSTACK_AUDIT_ROOTS="$RT_ROOT" BSTACK_DIR="$FAKE_BSTACK" python3 "$AUDIT_PY" "$@"; }
+
+# T10: untested detection — only 'coded' flagged; 'tested' + md-only 'docs' exempt
+t="untested detection (coded flagged; tested + md-only exempt)"
+if rt_audit --json --no-logs 2>/dev/null | python3 -c "import json,sys; d=json.load(sys.stdin); u={x['name'] for x in d['untested']}; assert u=={'coded'}, u" 2>/dev/null; then ap "$t"; else af "$t"; fi
+
+# T11: --require-tests gate exits 1 when an untested skill exists
+t="--require-tests exits 1 on untested skill"
+rt_audit --no-logs --require-tests >/dev/null 2>&1; rc=$?
+if [ "$rc" -eq 1 ]; then ap "$t"; else af "$t" "rc=$rc (expected 1)"; fi
+
+# T12: without --require-tests, untested report is informational (exit 0)
+t="untested report informational without --require-tests (exit 0)"
+rt_audit --no-logs >/dev/null 2>&1; rc=$?
+if [ "$rc" -eq 0 ]; then ap "$t"; else af "$t" "rc=$rc (expected 0)"; fi
+
+# T13: human report includes the Untested section
+t="human report includes '## Untested deterministic code'"
+if rt_audit --no-logs 2>/dev/null | grep -q '## Untested deterministic code'; then ap "$t"; else af "$t"; fi
+
+rm -rf "$FX2"
 rm -rf "$FX"
 echo ""
 echo "── results: $PASS passed, $FAIL failed ────────────────────────────"