From b4ce852bfd8096025cdda5d9851a43e81a913ba5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 18 Jun 2026 21:47:14 +0300 Subject: [PATCH] =?UTF-8?q?feat(ci):=20stale-run=20janitor=20=E2=80=94=20c?= =?UTF-8?q?ancel=20zombie=20Gate=20A=20runs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A dead Namespace runner leaves its Gate A run wedged `in_progress` past the timeout (GitHub never gets the completion signal); abandoned runs sit `queued` forever. Either way they keep counting against the submission governor's in-flight budget (UNSORRY_MAX_GATE_A_IN_FLIGHT), starving real verification and triggering cancellation cascades — the "70 PRs failing" that was really one zombie jam occupying the budget. Adds tools/repo/stale_runs.py + the stale-run-janitor workflow (every 20 min + manual). Cancels gate-a runs in_progress past a normal run (default 90 min) or queued long enough to be abandoned (default 180 min) — conservative limits that never touch a normal ~15 min run or a briefly-queued one. Uses the default GITHUB_TOKEN (actions: write); no REFRESH_TOKEN needed. Pure is_stale predicate is unit-tested. 
Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .github/workflows/stale-run-janitor.yml |  51 +++++++++++
 changelog.d/added-stale-run-janitor.md  |   1 +
 tools/repo/stale_runs.py                | 104 ++++++++++++++++++++++++
 tools/repo/tests/test_stale_runs.py     |  43 ++++++++++
 4 files changed, 199 insertions(+)
 create mode 100644 .github/workflows/stale-run-janitor.yml
 create mode 100644 changelog.d/added-stale-run-janitor.md
 create mode 100644 tools/repo/stale_runs.py
 create mode 100644 tools/repo/tests/test_stale_runs.py

diff --git a/.github/workflows/stale-run-janitor.yml b/.github/workflows/stale-run-janitor.yml
new file mode 100644
index 000000000..46f947e13
--- /dev/null
+++ b/.github/workflows/stale-run-janitor.yml
@@ -0,0 +1,51 @@
+# Cancel Gate A runs that are stuck — a dead Namespace runner leaves its run
+# wedged `in_progress` past the timeout, and abandoned runs sit `queued` forever;
+# either way they keep eating the submission governor's in-flight budget
+# (UNSORRY_MAX_GATE_A_IN_FLIGHT), starving real verification and triggering
+# cancellation cascades (the "70 failing" zombie jam). This keeps the budget
+# honest. Uses the default GITHUB_TOKEN (actions: write) — no PR creation, so no
+# REFRESH_TOKEN needed. Conservative thresholds: never touches a normal (~15 min)
+# run or a briefly-queued one.
+name: stale-run-janitor
+
+on:
+  schedule:
+    - cron: "*/20 * * * *"
+  workflow_dispatch:
+    inputs:
+      in_progress_minutes:
+        description: "Cancel in_progress Gate A runs older than this"
+        default: "90"
+      queued_minutes:
+        description: "Cancel queued Gate A runs older than this"
+        default: "180"
+
+concurrency:
+  group: stale-run-janitor
+  cancel-in-progress: false
+
+permissions:
+  actions: write
+  contents: read
+
+jobs:
+  janitor:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: "3.12"
+      - name: Cancel stale Gate A runs
+        env:
+          GH_TOKEN: ${{ github.token }}
+          # Dispatch inputs reach the script as env vars, never as text expanded
+          # into the shell script, so a crafted value cannot inject commands.
+          IN_PROGRESS_MINUTES: ${{ github.event.inputs.in_progress_minutes || '90' }}
+          QUEUED_MINUTES: ${{ github.event.inputs.queued_minutes || '180' }}
+        run: |
+          python3 -m tools.repo.stale_runs \
+            --workflow gate-a.yml \
+            --repo "${{ github.repository }}" \
+            --in-progress-minutes "$IN_PROGRESS_MINUTES" \
+            --queued-minutes "$QUEUED_MINUTES"
diff --git a/changelog.d/added-stale-run-janitor.md b/changelog.d/added-stale-run-janitor.md
new file mode 100644
index 000000000..7d25dd000
--- /dev/null
+++ b/changelog.d/added-stale-run-janitor.md
@@ -0,0 +1 @@
+Added a stale-run janitor (`tools/repo/stale_runs.py` + the `stale-run-janitor` workflow, every 20 min + manual) that cancels Gate A runs stuck `in_progress` past a normal run (default 90 min) or `queued` long enough to be abandoned (default 180 min). When a Namespace runner dies mid-job, GitHub never gets the completion signal and the run wedges `in_progress` forever; abandoned runs sit `queued` indefinitely. Either way they keep counting against the submission governor's in-flight budget (`UNSORRY_MAX_GATE_A_IN_FLIGHT`), starving real verification and triggering cancellation cascades — the "many PRs failing" symptom that is really one zombie jam occupying the budget.
The janitor keeps the in-flight count reflecting *live* work, so the cap means what it says and zombie-induced cancellation cascades stop recurring. Conservative thresholds never touch a normal (~15 min) run or a briefly-queued one; uses the default `GITHUB_TOKEN` (`actions: write`), so no `REFRESH_TOKEN` is required.
diff --git a/tools/repo/stale_runs.py b/tools/repo/stale_runs.py
new file mode 100644
index 000000000..eb687d8b2
--- /dev/null
+++ b/tools/repo/stale_runs.py
@@ -0,0 +1,145 @@
+"""Stale-run janitor — cancel Gate A runs that are stuck (ADR-058 hygiene).
+
+A Namespace runner that dies mid-job leaves its workflow run wedged `in_progress`
+(GitHub never gets the completion signal, so it sits there past the timeout), and
+abandoned runs can sit `queued` forever waiting for a runner that will never come.
+Either way they keep counting against the submission governor's in-flight budget
+(`UNSORRY_MAX_GATE_A_IN_FLIGHT`), starving real verification and triggering
+cancellation cascades — the "70 PRs failing" that was really one zombie jam.
+
+This cancels such runs so the in-flight budget reflects *live* work. Conservative
+by design: `in_progress` is only stale well past a normal run (default 90 min;
+normal incremental Gate A is ~15 min), and `queued` only when it's clearly
+abandoned (default 180 min; legitimate queueing waits minutes, not hours) — so it
+never kills a genuinely-running or briefly-queued job.
+
+Age is measured from the start of the latest attempt (`startedAt`), not from
+`createdAt`: a re-run keeps the original `createdAt`, so anchoring on it would
+cancel a freshly re-run job as soon as its first attempt is older than the limit.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+from datetime import datetime, timezone
+
+
+def _parse_iso(value: str) -> datetime:
+    """Parse a GitHub ISO-8601 timestamp (trailing `Z` allowed)."""
+    return datetime.fromisoformat(value.replace("Z", "+00:00"))
+
+
+def age_minutes(created_at: str, now: datetime) -> float:
+    """Return the minutes elapsed between timestamp `created_at` and `now`."""
+    return (now - _parse_iso(created_at)).total_seconds() / 60.0
+
+
+def is_stale(status: str, created_at: str, now: datetime,
+             in_progress_max: float, queued_max: float) -> bool:
+    """Pure predicate: a run is stale when its age exceeds the per-status limit.
+
+    `created_at` is the timestamp age is measured from (`main` passes
+    `age_anchor(run)`). Only `in_progress` and `queued` can be stale.
+    """
+    age = age_minutes(created_at, now)
+    if status == "in_progress":
+        return age > in_progress_max
+    if status == "queued":
+        return age > queued_max
+    return False
+
+
+def age_anchor(run: dict) -> str:
+    """Return the timestamp a run's age is measured from.
+
+    Prefers `startedAt` (start of the latest attempt, reset on re-run) and
+    falls back to `createdAt` when it is missing or a year-0001 placeholder.
+    """
+    started = run.get("startedAt") or ""
+    # NOTE(review): gh appears to emit "0001-01-01T00:00:00Z" for an unset time;
+    # trusting it would make every run look ancient, so fall back — confirm.
+    if started and not started.startswith("0001-"):
+        return started
+    return run["createdAt"]
+
+
+def _active_runs(workflow: str, repo: str | None) -> list[dict]:
+    """List `in_progress` and `queued` runs of `workflow` via `gh run list`.
+
+    A failed listing for one status is warned about on stderr and skipped.
+    """
+    runs: list[dict] = []
+    for status in ("in_progress", "queued"):
+        cmd = ["gh", "run", "list", "--workflow", workflow, "--status", status,
+               "--limit", "200",
+               "--json", "databaseId,createdAt,startedAt,status"]
+        if repo:
+            cmd += ["--repo", repo]
+        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                              text=True, check=False)
+        if proc.returncode != 0:
+            print(f"warning: gh run list ({status}) failed: {proc.stderr.strip()}",
+                  file=sys.stderr)
+            continue
+        for r in json.loads(proc.stdout or "[]"):
+            r["status"] = status  # gh sometimes blanks status in the list payload
+            runs.append(r)
+    return runs
+
+
+def _cancel(run_id: int, repo: str | None) -> bool:
+    """Cancel one run with `gh run cancel`; return True on success.
+
+    On failure gh's stderr is echoed so the CI log shows the reason.
+    """
+    cmd = ["gh", "run", "cancel", str(run_id)]
+    if repo:
+        cmd += ["--repo", repo]
+    proc = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE,
+                          text=True, check=False)
+    reason = proc.stderr.strip()
+    if proc.returncode != 0 and reason:
+        print(f"gh run cancel {run_id}: {reason}", file=sys.stderr)
+    return proc.returncode == 0
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: cancel stale runs, or only list them with `--dry-run`.
+
+    Always returns 0; cancel failures are reported on stderr.
+    """
+    ap = argparse.ArgumentParser(
+        prog="python3 -m tools.repo.stale_runs",
+        description="Cancel Gate A runs stuck in_progress/queued past a limit.")
+    ap.add_argument("--workflow", default="gate-a.yml")
+    ap.add_argument("--repo", default=None, help="owner/name (default: gh's repo)")
+    ap.add_argument("--in-progress-minutes", type=float, default=90.0)
+    ap.add_argument("--queued-minutes", type=float, default=180.0)
+    ap.add_argument("--dry-run", action="store_true")
+    args = ap.parse_args(argv)
+
+    now = datetime.now(timezone.utc)
+    stale = [r for r in _active_runs(args.workflow, args.repo)
+             if is_stale(r["status"], age_anchor(r), now,
+                         args.in_progress_minutes, args.queued_minutes)]
+    cancelled = 0
+    for r in stale:
+        rid, age = r["databaseId"], age_minutes(age_anchor(r), now)
+        if args.dry_run:
+            print(f"would cancel {rid} ({r['status']}, {age:.0f}m)")
+            continue
+        if _cancel(rid, args.repo):
+            cancelled += 1
+            print(f"cancelled {rid} ({r['status']}, {age:.0f}m)")
+        else:
+            print(f"failed to cancel {rid}", file=sys.stderr)
+    print(f"stale-run janitor: {len(stale)} stale, {cancelled} cancelled "
+          f"({args.workflow}; in_progress>{args.in_progress_minutes:.0f}m "
+          f"queued>{args.queued_minutes:.0f}m)")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/repo/tests/test_stale_runs.py b/tools/repo/tests/test_stale_runs.py
new file mode 100644
index 000000000..337863455
--- /dev/null
+++ b/tools/repo/tests/test_stale_runs.py
@@ -0,0 +1,56 @@
+from datetime import datetime, timezone
+
+from tools.repo import stale_runs
+
+NOW = datetime(2026, 6, 18, 18, 30, tzinfo=timezone.utc)
+
+
+def _ago(minutes):
+    from datetime import timedelta
+    return (NOW - timedelta(minutes=minutes)).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def test_in_progress_zombie_is_stale():
+    # 6h in_progress (the real zombie shape) is stale.
+    assert stale_runs.is_stale("in_progress", _ago(360), NOW, 90, 180) is True
+
+
+def test_in_progress_normal_run_is_not_stale():
+    # A ~15 min run mid-flight is NOT touched.
+    assert stale_runs.is_stale("in_progress", _ago(15), NOW, 90, 180) is False
+
+
+def test_in_progress_just_under_limit_kept():
+    assert stale_runs.is_stale("in_progress", _ago(89), NOW, 90, 180) is False
+    assert stale_runs.is_stale("in_progress", _ago(91), NOW, 90, 180) is True
+
+
+def test_queued_briefly_is_not_stale():
+    # Legitimate queueing (waiting for a runner) must never be cancelled.
+    assert stale_runs.is_stale("queued", _ago(45), NOW, 90, 180) is False
+
+
+def test_queued_abandoned_is_stale():
+    # Queued for hours = the runner will never come; abandon it.
+    assert stale_runs.is_stale("queued", _ago(240), NOW, 90, 180) is True
+
+
+def test_completed_status_never_stale():
+    assert stale_runs.is_stale("completed", _ago(999), NOW, 90, 180) is False
+
+
+def test_age_minutes():
+    assert round(stale_runs.age_minutes(_ago(90), NOW)) == 90
+
+
+def test_rerun_is_aged_from_latest_attempt():
+    # A re-run keeps its old createdAt; only startedAt reflects the live attempt.
+    run = {"createdAt": _ago(2000), "startedAt": _ago(5)}
+    anchor = stale_runs.age_anchor(run)
+    assert stale_runs.is_stale("in_progress", anchor, NOW, 90, 180) is False
+
+
+def test_age_anchor_falls_back_to_created_at():
+    assert stale_runs.age_anchor({"createdAt": _ago(7)}) == _ago(7)
+    placeholder = {"createdAt": _ago(7), "startedAt": "0001-01-01T00:00:00Z"}
+    assert stale_runs.age_anchor(placeholder) == _ago(7)