Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions .github/workflows/stale-run-janitor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Cancel Gate A runs that are stuck — a dead Namespace runner leaves its run
# wedged `in_progress` past the timeout, and abandoned runs sit `queued` forever;
# either way they keep eating the submission governor's in-flight budget
# (UNSORRY_MAX_GATE_A_IN_FLIGHT), starving real verification and triggering
# cancellation cascades (the "70 failing" zombie jam). This keeps the budget
# honest. Uses the default GITHUB_TOKEN (actions: write) — no PR creation, so no
# REFRESH_TOKEN needed. Conservative thresholds: never touches a normal (~15 min)
# run or a briefly-queued one.
name: stale-run-janitor

on:
  schedule:
    - cron: "*/20 * * * *"
  workflow_dispatch:
    inputs:
      in_progress_minutes:
        description: "Cancel in_progress Gate A runs older than this"
        default: "90"
      queued_minutes:
        description: "Cancel queued Gate A runs older than this"
        default: "180"

# One janitor at a time; a running pass is allowed to finish rather than being
# replaced by the next scheduled tick.
concurrency:
  group: stale-run-janitor
  cancel-in-progress: false

permissions:
  actions: write
  contents: read

jobs:
  janitor:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.12"
      - name: Cancel stale Gate A runs
        env:
          GH_TOKEN: ${{ github.token }}
          # Expressions are expanded into the script text before the shell
          # runs, so manual-dispatch inputs are handed over as environment
          # variables instead of being spliced into the command line. On
          # scheduled runs the inputs are empty and the `||` fallbacks apply.
          REPO: ${{ github.repository }}
          IN_PROGRESS_MINUTES: ${{ github.event.inputs.in_progress_minutes || '90' }}
          QUEUED_MINUTES: ${{ github.event.inputs.queued_minutes || '180' }}
        run: |
          python3 -m tools.repo.stale_runs \
            --workflow gate-a.yml \
            --repo "$REPO" \
            --in-progress-minutes "$IN_PROGRESS_MINUTES" \
            --queued-minutes "$QUEUED_MINUTES"
1 change: 1 addition & 0 deletions changelog.d/added-stale-run-janitor.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added a stale-run janitor (`tools/repo/stale_runs.py` + the `stale-run-janitor` workflow, every 20 min + manual) that cancels Gate A runs stuck `in_progress` past a normal run (default 90 min) or `queued` long enough to be abandoned (default 180 min). When a Namespace runner dies mid-job, GitHub never gets the completion signal and the run wedges `in_progress` forever; abandoned runs sit `queued` indefinitely. Either way they keep counting against the submission governor's in-flight budget (`UNSORRY_MAX_GATE_A_IN_FLIGHT`), starving real verification and triggering cancellation cascades — the "many PRs failing" symptom that is really one zombie jam occupying the budget. The janitor keeps the in-flight count reflecting *live* work, so the cap means what it says and zombie-induced cancellation cascades stop recurring. Conservative thresholds never touch a normal (~15 min) run or a briefly-queued one; uses the default `GITHUB_TOKEN` (`actions: write`), so no `REFRESH_TOKEN` is required.
104 changes: 104 additions & 0 deletions tools/repo/stale_runs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""Stale-run janitor — cancel Gate A runs that are stuck (ADR-058 hygiene).

A Namespace runner that dies mid-job leaves its workflow run wedged `in_progress`
(GitHub never gets the completion signal, so it sits there past the timeout), and
abandoned runs can sit `queued` forever waiting for a runner that will never come.
Either way they keep counting against the submission governor's in-flight budget
(`UNSORRY_MAX_GATE_A_IN_FLIGHT`), starving real verification and triggering
cancellation cascades — the "70 PRs failing" that was really one zombie jam.

This cancels such runs so the in-flight budget reflects *live* work. Conservative
by design: `in_progress` is only stale well past a normal run (default 90 min;
normal incremental Gate A is ~15 min), and `queued` only when it's clearly
abandoned (default 180 min; legitimate queueing waits minutes, not hours) — so it
never kills a genuinely-running or briefly-queued job.
"""
from __future__ import annotations

import argparse
import json
import subprocess
import sys
from datetime import datetime, timezone


def _parse_iso(value: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    Args:
        value: Timestamp text such as ``2026-06-18T18:30:00Z``; a trailing
            ``Z`` or an explicit UTC offset is accepted.

    Returns:
        An aware ``datetime``. A timestamp that carries no offset at all is
        taken to be UTC, so the result can always be subtracted from an aware
        "now" without raising ``TypeError``.

    Raises:
        ValueError: If *value* is not a valid ISO-8601 timestamp.
    """
    # datetime.fromisoformat() only understands a literal "Z" suffix from
    # Python 3.11 on, so spell it as an explicit offset.
    if value.endswith("Z"):
        value = value[:-1] + "+00:00"
    parsed = datetime.fromisoformat(value)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def age_minutes(created_at: str, now: datetime) -> float:
    """Return how many minutes lie between *created_at* and *now*.

    Args:
        created_at: ISO-8601 timestamp text (see ``_parse_iso``).
        now: Timezone-aware reference instant.

    Returns:
        The elapsed time in (fractional) minutes; negative when *created_at*
        is later than *now*.
    """
    return (now - _parse_iso(created_at)).total_seconds() / 60.0


def is_stale(status: str, created_at: str, now: datetime,
             in_progress_max: float, queued_max: float) -> bool:
    """Decide whether a run has outlived the limit that applies to its status.

    Args:
        status: Run status; only ``"in_progress"`` and ``"queued"`` have limits.
        created_at: ISO-8601 timestamp the run's age is measured from.
        now: Reference instant the age is measured against.
        in_progress_max: Age limit, in minutes, for ``in_progress`` runs.
        queued_max: Age limit, in minutes, for ``queued`` runs.

    Returns:
        ``True`` when the age is strictly greater than the limit for *status*;
        ``False`` for a run at or under its limit and for any other status.
    """
    # The age is computed up front, before the status is even inspected.
    minutes_old = age_minutes(created_at, now)
    limit_by_status = {"in_progress": in_progress_max, "queued": queued_max}
    if status not in limit_by_status:
        return False
    return minutes_old > limit_by_status[status]


def _active_runs(workflow: str, repo: str | None) -> list[dict]:
    """List the ``in_progress`` and ``queued`` runs of *workflow* via ``gh``.

    Args:
        workflow: Workflow file name or id passed to ``gh run list --workflow``.
        repo: ``owner/name`` to query, or ``None`` to let ``gh`` pick the repo.

    Returns:
        One dict per run with ``databaseId``, ``createdAt`` and ``status``
        (plus ``startedAt`` as reported by ``gh``). ``createdAt`` is moved
        forward to ``startedAt`` whenever that is later, so callers that age a
        run from ``createdAt`` measure the latest attempt rather than the
        original creation — otherwise a re-run of an old run would look stale
        the moment it starts. A status whose listing fails is reported on
        stderr and skipped (best effort), so the result may be partial.
    """
    runs: list[dict] = []
    for status in ("in_progress", "queued"):
        cmd = ["gh", "run", "list", "--workflow", workflow, "--status", status,
               "--limit", "200", "--json",
               "databaseId,createdAt,startedAt,status"]
        if repo:
            cmd += ["--repo", repo]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              text=True, check=False)
        if proc.returncode != 0:
            print(f"warning: gh run list ({status}) failed: {proc.stderr.strip()}",
                  file=sys.stderr)
            continue
        try:
            payload = json.loads(proc.stdout or "[]")
        except json.JSONDecodeError as exc:
            # Same best-effort policy as a non-zero exit: warn and move on.
            print(f"warning: gh run list ({status}) returned invalid JSON: {exc}",
                  file=sys.stderr)
            continue
        for r in payload:
            r["status"] = status  # gh sometimes blanks status in the list payload
            # Both stamps share one UTC "...Z" layout, so plain string order is
            # chronological; a missing or zero-valued ("0001-01-01...")
            # startedAt sorts first and leaves createdAt untouched.
            started = r.get("startedAt") or ""
            if started > r.get("createdAt", ""):
                r["createdAt"] = started
            runs.append(r)
    return runs


def _cancel(run_id: int, repo: str | None) -> bool:
    """Ask ``gh`` to cancel one workflow run.

    Args:
        run_id: Database id of the run to cancel.
        repo: ``owner/name`` the run belongs to, or ``None`` to let ``gh``
            pick the repo.

    Returns:
        ``True`` when ``gh run cancel`` exits 0, ``False`` otherwise. On
        failure gh's own error text is echoed to stderr so the reason is not
        lost.
    """
    cmd = ["gh", "run", "cancel", str(run_id)]
    if repo:
        cmd += ["--repo", repo]
    proc = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE,
                          text=True, check=False)
    if proc.returncode != 0:
        print(f"warning: gh run cancel {run_id} failed: {proc.stderr.strip()}",
              file=sys.stderr)
        return False
    return True


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: find stale runs and cancel (or just list) them.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.

    Returns:
        Always ``0``; individual cancel failures are only reported on stderr.
    """
    parser = argparse.ArgumentParser(
        prog="python3 -m tools.repo.stale_runs",
        description="Cancel Gate A runs stuck in_progress/queued past a limit.")
    parser.add_argument("--workflow", default="gate-a.yml")
    parser.add_argument("--repo", default=None,
                        help="owner/name (default: gh's repo)")
    parser.add_argument("--in-progress-minutes", type=float, default=90.0)
    parser.add_argument("--queued-minutes", type=float, default=180.0)
    parser.add_argument("--dry-run", action="store_true")
    opts = parser.parse_args(argv)

    # One reference instant for the whole pass, taken before listing runs.
    reference = datetime.now(timezone.utc)
    candidates = []
    for run in _active_runs(opts.workflow, opts.repo):
        if is_stale(run["status"], run["createdAt"], reference,
                    opts.in_progress_minutes, opts.queued_minutes):
            candidates.append(run)

    n_cancelled = 0
    for run in candidates:
        run_id = run["databaseId"]
        minutes_old = age_minutes(run["createdAt"], reference)
        detail = f"({run['status']}, {minutes_old:.0f}m)"
        if opts.dry_run:
            print(f"would cancel {run_id} {detail}")
        elif _cancel(run_id, opts.repo):
            n_cancelled += 1
            print(f"cancelled {run_id} {detail}")
        else:
            print(f"failed to cancel {run_id}", file=sys.stderr)

    limits = (f"in_progress>{opts.in_progress_minutes:.0f}m "
              f"queued>{opts.queued_minutes:.0f}m")
    print(f"stale-run janitor: {len(candidates)} stale, {n_cancelled} cancelled "
          f"({opts.workflow}; {limits})")
    return 0


# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
43 changes: 43 additions & 0 deletions tools/repo/tests/test_stale_runs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from datetime import datetime, timezone

from tools.repo import stale_runs

NOW = datetime(2026, 6, 18, 18, 30, tzinfo=timezone.utc)


def _ago(minutes):
    """Return the UTC ``...Z`` timestamp text for *minutes* before ``NOW``."""
    from datetime import timedelta
    moment = NOW - timedelta(minutes=minutes)
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")


def test_in_progress_zombie_is_stale():
    """A run in_progress for six hours (the real zombie shape) is stale."""
    six_hours_ago = _ago(360)
    verdict = stale_runs.is_stale("in_progress", six_hours_ago, NOW, 90, 180)
    assert verdict is True


def test_in_progress_normal_run_is_not_stale():
    """A ~15 min run that is still mid-flight must be left alone."""
    quarter_hour_ago = _ago(15)
    verdict = stale_runs.is_stale("in_progress", quarter_hour_ago, NOW, 90, 180)
    assert verdict is False


def test_in_progress_just_under_limit_kept():
    """Pin the in_progress limit from both sides and exactly on the boundary."""
    # One minute under the 90 min limit: kept.
    assert stale_runs.is_stale("in_progress", _ago(89), NOW, 90, 180) is False
    # Exactly at the limit: still kept, because the comparison is strictly
    # "older than" the limit.
    assert stale_runs.is_stale("in_progress", _ago(90), NOW, 90, 180) is False
    # One minute over: stale.
    assert stale_runs.is_stale("in_progress", _ago(91), NOW, 90, 180) is True


def test_queued_briefly_is_not_stale():
    """Legitimate queueing (waiting for a runner) must never be cancelled."""
    queued_since = _ago(45)
    verdict = stale_runs.is_stale("queued", queued_since, NOW, 90, 180)
    assert verdict is False


def test_queued_abandoned_is_stale():
    """Queued for hours means the runner will never come; treat it as stale."""
    queued_since = _ago(240)
    verdict = stale_runs.is_stale("queued", queued_since, NOW, 90, 180)
    assert verdict is True


def test_completed_status_never_stale():
    """A status with no limit of its own is never stale, however old."""
    long_ago = _ago(999)
    verdict = stale_runs.is_stale("completed", long_ago, NOW, 90, 180)
    assert verdict is False


def test_age_minutes():
    """A timestamp 90 minutes before NOW is reported as 90 minutes old."""
    elapsed = stale_runs.age_minutes(_ago(90), NOW)
    assert round(elapsed) == 90
Loading