diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000..3de3291
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,42 @@
+# CODEOWNERS for allora-network/.github (DEVOP-560)
+#
+# Detector-disable-via-PR is the failure mode this file guards against:
+# a single PR that touches the daily Shai-Hulud IOC sweep workflow, the
+# vendored detector script, the IOC seed lists, or the SHA-256 integrity
+# sidecar can silently turn the daily detection off if no security review
+# is enforced. CODEOWNERS makes review by `@allora-network/security`
+# (or, for the workflow itself, `@allora-network/devops`) the in-repo
+# gate. The complementary branch-protection rule ("Require review from
+# Code Owners") is an org-admin task tracked in
+# docs/plans/2026-05-25-devop-560-shai-hulud-sweep.md.
+#
+# Team-name caveat: teams are pinned to the slugs verified on
+# 2026-05-25 via `gh api orgs/allora-network/teams/{security,devops}`.
+# If either team is renamed or restructured, update this file in lock-
+# step or the rule silently degrades to "no required reviewer" for the
+# affected path.
+#
+# Last entry wins per path. The order below is "broadest fallback first,
+# narrowest security-critical paths last" so the security/devops rules
+# take precedence over any future generic catch-alls added above them.
+
+# Default owner for the rest of the repo — devops carries the `.github`
+# operations surface. (Add new `@allora-network/`-style path entries BELOW
+# this default — a later `*` would override anything placed above it.)
+* @allora-network/devops
+
+# Shai-Hulud detection surface — security owns the vendored detector
+# script, the SHA-256 integrity sidecar, the IOC seed files, and the
+# security folder generally. The workflow lists devops as a co-owner, so
+# approval from EITHER team satisfies the code-owner gate there: devops
+# alone can approve workflow edits (concurrency, retention, etc.).
+/.github/workflows/shai-hulud-sweep.yml @allora-network/security @allora-network/devops +/scripts/shai-hulud-ioc-sweep.sh @allora-network/security +/scripts/shai-hulud-ioc-sweep.sh.sha256 @allora-network/security +/.github/security/ @allora-network/security + +# This file itself — require security review to change ownership rules +# for any of the paths above. Without this entry, a PR could rewrite +# CODEOWNERS to remove the security reviewer for the workflow in the +# same PR that disables detection. +/.github/CODEOWNERS @allora-network/security diff --git a/.github/security/ioc-hashes.txt b/.github/security/ioc-hashes.txt index 6afc182..3783645 100644 --- a/.github/security/ioc-hashes.txt +++ b/.github/security/ioc-hashes.txt @@ -1,3 +1,4 @@ +# schema:v1 # IOC SHA-256 hash list — known Shai-Hulud dropper payloads. # # Format: one lowercase hex SHA-256 per line, optional `# comment` after. diff --git a/.github/security/ioc-packages.txt b/.github/security/ioc-packages.txt index f508172..d4f2030 100644 --- a/.github/security/ioc-packages.txt +++ b/.github/security/ioc-packages.txt @@ -1,3 +1,4 @@ +# schema:v1 # IOC package list — Shai-Hulud / Socket-tracked compromised npm + PyPI releases. # # Format: one line per :@ diff --git a/.github/workflows/shai-hulud-sweep.yml b/.github/workflows/shai-hulud-sweep.yml new file mode 100644 index 0000000..c39841f --- /dev/null +++ b/.github/workflows/shai-hulud-sweep.yml @@ -0,0 +1,800 @@ +name: Shai-Hulud IOC Sweep + +# Daily org-wide sweep for Shai-Hulud indicators of compromise. +# Reference: DEVOP-560. Canonical detection logic lives in +# scripts/shai-hulud-ioc-sweep.sh (vendored from +# allora-network/skills @ skills/shai-hulud-defense/scripts/shai-hulud-ioc-sweep.sh +# — see the file header for refresh procedure / pinned commit). +# +# Outputs: +# - clean (script exit 0): no-op (no issue update, no Slack page). 
+# - operational (exit 2): append a comment to the rolling issue labelled +# `shai-hulud-sweep`, or open a new one if none. +# - IOC-grade (exit 1): same rolling-issue update + page Slack via the +# `SLACK_SECURITY_WEBHOOK` org secret. +# +# Humans drive close-and-reopen of the rolling issue so triage state is +# preserved across runs; the workflow never auto-closes. + +on: + schedule: + # 04:07 UTC daily — off-peak + off-minute to dodge GitHub Actions cron + # contention spikes on whole-hour boundaries. + - cron: '7 4 * * *' + workflow_dispatch: + +permissions: + contents: read + issues: write + +# Serialize concurrent runs so a manual `workflow_dispatch` while the daily +# cron is mid-sweep can't race on the rolling issue / Slack page. Never +# cancel an in-progress sweep — partial scans are more dangerous than +# delayed ones (they false-clean unscanned repos). +concurrency: + group: shai-hulud-sweep + cancel-in-progress: false + +jobs: + sweep: + name: Sweep allora-network for IOCs + runs-on: ubuntu-latest + timeout-minutes: 60 + env: + ORG: allora-network + ROLLING_LABEL: shai-hulud-sweep + # Prefer a dedicated `GH_ORG_READ_TOKEN` secret (fine-grained PAT or + # GitHub App token with `read:org` + repo:read) when provisioned — + # required for private-repo enumeration and member exfil search. Falls + # back to the workflow's default GITHUB_TOKEN, which can only see + # public org repos and lacks `read:org`; in that mode the script + # emits `check_skipped` operational findings for the members API so + # the partial-coverage state is visible in the rolling issue, never + # silently false-cleaned. See docs/plans/2026-05-25-devop-560-*.md. + GH_TOKEN: ${{ secrets.GH_ORG_READ_TOKEN || secrets.GITHUB_TOKEN }} + + steps: + - name: Checkout .github repo (for IOC lists + sweep script) + # SHA pin: actions/checkout v4.2.2 (Oct 2024). Matches the pin used + # in allora-network/ci-workflows-private hardened reusable workflows. 
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + persist-credentials: false + + - name: Verify required tools + shell: bash + run: | + set -euo pipefail + gh --version + jq --version + sha256sum --version | head -1 + git --version + + - name: Run Shai-Hulud IOC sweep + id: sweep + shell: bash + # The script exits 0 (clean) / 1 (IOC) / 2 (operational). We deliberately + # do NOT let a 1/2 exit fail the step — those are reported via the + # rolling-issue + Slack page steps below, not by failing the workflow + # (which would obscure the structured output behind a red ❌ and break + # the daily-cron observability story). Only true infra errors should + # fail this step. + run: | + set -uo pipefail + # Integrity gate: the script is vendored verbatim from upstream + # (allora-network/skills). Verify the committed SHA-256 sidecar + # matches the on-disk file before invoking it — any PR that + # modifies the script body without refreshing the sidecar fails + # this step loudly instead of executing a tampered detector. + # Refresh: `shasum -a 256 scripts/shai-hulud-ioc-sweep.sh > scripts/shai-hulud-ioc-sweep.sh.sha256` + # + # Pin the sidecar's path field before invoking `sha256sum --check`. + # `sha256sum --check` parses each manifest line as ` ` + # and resolves relative to cwd — it does NOT enforce that + # equals scripts/shai-hulud-ioc-sweep.sh. Without this + # guard, an attacker who can mutate the sidecar (CODEOWNERS gate + # requires security review, but defense-in-depth) could repoint + # the sidecar at any other in-tree file whose hash they control + # (e.g. LICENSE, or the sidecar itself) and pass the gate while + # the actual detector script is silently swapped to a no-op. + # Lock the gate to the single expected ` ` line. + if ! 
grep -qE '^[0-9a-f]{64} scripts/shai-hulud-ioc-sweep\.sh$' scripts/shai-hulud-ioc-sweep.sh.sha256; then + echo "::error::Sidecar path validation failed for scripts/shai-hulud-ioc-sweep.sh.sha256 — expected exactly one line of the form ' scripts/shai-hulud-ioc-sweep.sh'. Refusing to execute (sidecar may have been retargeted at a different file)." + exit 1 + fi + if [ "$(wc -l < scripts/shai-hulud-ioc-sweep.sh.sha256 | tr -d ' ')" != "1" ]; then + echo "::error::Sidecar scripts/shai-hulud-ioc-sweep.sh.sha256 must contain exactly one manifest line. Refusing to execute." + exit 1 + fi + if ! sha256sum --check --status scripts/shai-hulud-ioc-sweep.sh.sha256; then + echo "::error::Integrity check failed for scripts/shai-hulud-ioc-sweep.sh — committed SHA-256 sidecar does not match the file on disk. Refusing to execute." + exit 1 + fi + chmod +x scripts/shai-hulud-ioc-sweep.sh + # Pin OUTPUT_DIR so subsequent steps can find findings.json and + # summary.md by a known path. Default of $(date) would change + # between steps. + OUTPUT_DIR="${GITHUB_WORKSPACE}/.shai-hulud-sweep" + export OUTPUT_DIR + mkdir -p "$OUTPUT_DIR" + echo "output_dir=$OUTPUT_DIR" >> "$GITHUB_OUTPUT" + set +e + ./scripts/shai-hulud-ioc-sweep.sh "$ORG" \ + .github/security/ioc-packages.txt \ + .github/security/ioc-hashes.txt + rc=$? + set -e + { + echo "rc=$rc" + echo "summary_path=${OUTPUT_DIR}/summary.md" + echo "findings_path=${OUTPUT_DIR}/findings.json" + } >> "$GITHUB_OUTPUT" + echo "Sweep exit code: $rc" + if [ -s "${OUTPUT_DIR}/summary.md" ]; then + echo "::group::Sweep summary" + cat "${OUTPUT_DIR}/summary.md" + echo "::endgroup::" + else + echo "::warning::sweep produced no summary.md — the script likely failed before aggregation." + fi + # Step succeeds for rc in {0,1,2}. Anything else is treated as + # infrastructure failure (script crashed, missing tool, etc.) and + # surfaces as a red workflow run for human attention. 
+ case "$rc" in + 0|1|2) exit 0 ;; + *) exit "$rc" ;; + esac + + - name: Upload sweep artifacts (findings only) + if: always() && steps.sweep.outputs.output_dir != '' + # SHA pin: actions/upload-artifact v4.4.3 (Oct 2024). + # + # Artifact scope is intentionally narrow: only the structured + # findings + summary + repo list. Anyone with `actions: read` on + # this repo can download workflow artifacts, but the org includes + # private repositories, and uploading raw clones (`clones/`) or + # preserved evidence trees (`evidence/`) would expose private-repo + # source code to anyone who can read this workflow's runs. + # Privileged incident responders can re-clone any flagged repo on + # demand from the findings — the runner just needs to surface + # which repos matched, not their full contents. + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 + with: + name: shai-hulud-sweep-${{ github.run_id }}-${{ github.run_attempt }} + path: | + ${{ steps.sweep.outputs.output_dir }}/findings.json + ${{ steps.sweep.outputs.output_dir }}/summary.md + ${{ steps.sweep.outputs.output_dir }}/repos.txt + if-no-files-found: warn + retention-days: 30 + + - name: Find rolling issue + # Resolve the rolling-issue number ONCE per run and reuse it across + # the ioc-dedup, rolling-issue-update, and Slack-paged-marker steps. + # Removes the duplicated `gh issue list` query that previously lived + # in both ioc-dedup and the update step (drift hazard: a future + # tweak to one query without the other would mis-locate the rolling + # issue on one surface) and closes a TOCTOU window where a human + # could close the rolling issue between the two queries, sending + # the second consumer to a stale/empty issue_num. + # + # Gated on `rc == '1' || rc == '2'` so this step matches the union + # of consumers below: ioc-dedup (rc=1), the update step (rc=1|2), + # and the new paged-at marker step (rc=1 + Slack-paged). Clean + # runs (rc=0) skip — there's nothing to attach. 
+        #
+        # Empty `issue_num` is a valid state (first ever non-clean run,
+        # rolling issue not yet created); the update step's gh-issue-create
+        # path below handles that case.
+        if: always() && (steps.sweep.outputs.rc == '1' || steps.sweep.outputs.rc == '2')
+        id: find-rolling-issue
+        shell: bash
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          # `gh issue list --search "... sort:created-asc"` mirrors the
+          # update step's lookup (oldest open issue with the rolling label
+          # wins, so a long-running incident issue stays canonical even if
+          # someone files a duplicate). `--limit 1` + `.[0].number // empty`
+          # gives an empty string when no rolling issue exists yet.
+          #
+          # A FAILED lookup (API outage, rate limit, auth error) must not be
+          # indistinguishable from "no rolling issue": both yield an empty
+          # `issue_num`, which sends the update step down its create path
+          # and opens a duplicate rolling issue. Retry a few times, keep
+          # gh's stderr in the step log for diagnosis, and warn explicitly
+          # if every attempt fails. The step still succeeds with an empty
+          # `issue_num` so downstream reporting is not blocked.
+          issue_num=""
+          lookup_ok="false"
+          for lookup_attempt in 1 2 3; do
+            if issue_num="$(gh issue list \
+              --search "label:\"$ROLLING_LABEL\" state:open sort:created-asc" \
+              --limit 1 \
+              --json number \
+              --jq '.[0].number // empty')"; then
+              lookup_ok="true"
+              break
+            fi
+            # Discard any partial stdout from the failed attempt.
+            issue_num=""
+            if [ "$lookup_attempt" -lt 3 ]; then
+              sleep 2
+            fi
+          done
+          echo "issue_num=${issue_num}" >> "$GITHUB_OUTPUT"
+          if [ "$lookup_ok" != "true" ]; then
+            echo "::warning::Rolling-issue lookup failed after 3 attempts for label=${ROLLING_LABEL}; proceeding as if no rolling issue exists. The update step may open a duplicate issue and the Slack dedup gate will see no prior stamp."
+          elif [ -n "$issue_num" ]; then
+            echo "::notice::Resolved rolling issue #${issue_num} for label=${ROLLING_LABEL}."
+          else
+            echo "::notice::No open rolling issue found for label=${ROLLING_LABEL}; update step will create one."
+          fi
+
+      - name: Compute IOC stamp and Slack dedup decision
+        # Decide whether the IOC-grade Slack page fires for this run. Without
+        # this gate the bare `rc == '1'` Slack step pages on EVERY IOC-grade
+        # run, so a standing unresolved IOC pages the channel daily and
+        # conditions responders to mute it — textbook alert fatigue. See
+        # PRRT_kwDOLZ5Xss6Ee5gN (cubic, DEVOP-560) and the original
+        # ce-code-review P1 finding (anchor 100) corroborating across four
+        # reviewers.
+ # + # Decision policy: + # - first IOC-grade run after clean (no prior stamp) → page + # - today's IOC set differs from the previous stamp → page + # - same IOC set as previous, but >= WEEKLY_REPAGE_S old → page + # (so a forgotten standing IOC doesn't silently age out) + # - otherwise → skip + # + # State persistence: hidden HTML markers embedded in IOC-grade + # rolling-issue comments by the next step below + # ( and + # ). Comments are durable across + # runs and survive issue close/reopen as long as the same issue is + # reused; when humans rotate to a fresh rolling issue, the missing + # prior stamp correctly forces a page on the next IOC-grade run. + # + # Stamp content: sha256 of sorted `{repo,rule,path,detail}` TSV of + # IOC-grade rows from findings.json. `ts` is intentionally excluded + # because it's per-run and would force a fresh stamp every cycle, + # defeating dedup entirely. IOC_RULES_RE MUST stay in sync with + # scripts/shai-hulud-ioc-sweep.sh (search for IOC_RULES_RE) — a + # silent drift would either mis-dedup (skip a real new IOC) or + # mis-page (re-page on operational-only changes). + if: always() && steps.sweep.outputs.rc == '1' + id: ioc-dedup + shell: bash + env: + FINDINGS_PATH: ${{ steps.sweep.outputs.findings_path }} + FIND_ROLLING_ISSUE_NUM: ${{ steps.find-rolling-issue.outputs.issue_num }} + # `issues:read` only — the same scope used by the rolling-issue + # update step below. Keep narrow. 
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          WEEKLY_REPAGE_S: '604800'  # 7 * 24 * 3600
+        run: |
+          set -euo pipefail
+          IOC_RULES_RE='^(ioc_package_match|ioc_bundle_hash|persistence_workflow|suspicious_lifecycle_script|public_exfil_repo|public_exfil_repo_member|go_suspicious_replace|go_replace_path_mismatch|go_unsafe_env|go_unsafe_env_indirect)$'
+
+          current_stamp=""
+          # Materialize the IOC-grade rows to a file BEFORE hashing them.
+          # Piping jq straight into `sha256sum` hides jq failures: on a
+          # malformed findings.json (or a filter error) jq emits nothing,
+          # sha256sum still prints the digest of empty input (e3b0c442...),
+          # and that well-formed 64-hex value passes the fail-open guard
+          # below. Consecutive unknown-state runs then compare equal and
+          # the Slack page is dedup-skipped instead of failing open.
+          #
+          # Hash only when jq exited 0 AND produced at least one row;
+          # otherwise `current_stamp` stays empty and the guard below pages.
+          # The bytes hashed are exactly jq's stdout, so a successful run
+          # yields the same stamp as the piped form did.
+          ioc_rows_file="${RUNNER_TEMP:-/tmp}/shai-hulud-ioc-rows.tsv"
+          if [ -s "$FINDINGS_PATH" ] \
+            && jq -r --arg re "$IOC_RULES_RE" '
+              [.[]
+               | select(.rule | test($re))
+               | "\(.repo)\t\(.rule)\t\(.path)\t\(.detail)"]
+              | sort
+              | .[]
+            ' "$FINDINGS_PATH" > "$ioc_rows_file" 2>/dev/null \
+            && [ -s "$ioc_rows_file" ]; then
+            # `|| true` keeps `set -euo pipefail` from aborting the step if
+            # the hash pipeline itself fails; an empty stamp then routes
+            # through the fail-open guard below.
+            current_stamp="$(sha256sum < "$ioc_rows_file" | awk '{print $1}' || true)"
+          fi
+          rm -f "$ioc_rows_file"
+          if [ -z "$current_stamp" ] \
+            || ! printf '%s' "$current_stamp" | grep -qE '^[0-9a-f]{64}$'; then
+            # rc=1 implies IOC_COUNT > 0 in the script, so an empty/invalid
+            # stamp here indicates a precondition failure (missing
+            # findings.json, pre-aggregation crash). Fail OPEN — page Slack
+            # so an unknown-state run surfaces visibly rather than silently
+            # dedup-skipping.
+            echo "::warning::Cannot compute IOC stamp from $FINDINGS_PATH; defaulting to page Slack (fail-open)."
+ { + echo "current_stamp=" + echo "prev_stamp=" + echo "prev_paged_at=" + echo "should_page=true" + echo "decision_reason=stamp-unknown" + } >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Rolling issue number resolved by the upstream find-rolling-issue + # step (single canonical lookup; eliminates the prior drift hazard + # of two independent `gh issue list` queries). Empty when no + # rolling issue exists yet (first ever IOC run); prev_* stay empty + # and we page. + issue_num="${FIND_ROLLING_ISSUE_NUM:-}" + + prev_stamp="" + prev_paged_at="" + if [ -n "$issue_num" ]; then + # Dump comments authored by `github-actions[bot]` ONLY across + # all pages, then sort cross-page by created_at desc and pick + # the most recent marker occurrence. Filtering by author closes + # the marker-spoofing channel: without it, anyone with `issues: + # write` (or anyone able to social-engineer a maintainer into + # pasting attacker-supplied marker text) could inject + # `` or + # `` into the + # rolling issue and silently suppress real Slack pages by + # poisoning the dedup chain. Only this workflow (running as + # GITHUB_TOKEN) emits the canonical markers, and its comments + # are authored by `github-actions[bot]` — restrict the source + # set accordingly. + # `gh api --paginate --jq '.[] | ...'` emits one JSON object + # per comment per page; jq -s re-slurps them so sort_by spans + # the whole history (per-page sorts would miss the global + # most-recent on multi-page rolling issues). 
+ comments_nd="$(gh api --paginate \ + "repos/${GITHUB_REPOSITORY}/issues/${issue_num}/comments" \ + --jq '.[] | select(.user.login == "github-actions[bot]") | {body: .body, created_at: .created_at}' \ + 2>/dev/null || true)" + if [ -n "$comments_nd" ]; then + prev_stamp="$(printf '%s\n' "$comments_nd" \ + | jq -rs ' + sort_by(.created_at) | reverse + | .[].body + | scan("") + ' 2>/dev/null \ + | head -1 \ + | sed -E 's/^$/\1/' \ + || true)" + prev_paged_at="$(printf '%s\n' "$comments_nd" \ + | jq -rs ' + sort_by(.created_at) | reverse + | .[].body + | scan("") + ' 2>/dev/null \ + | head -1 \ + | sed -E 's/^$/\1/' \ + || true)" + fi + fi + + should_page="false" + decision_reason="dedup-skip" + if [ -z "$prev_stamp" ]; then + should_page="true"; decision_reason="first-ioc-run" + elif [ "$prev_stamp" != "$current_stamp" ]; then + should_page="true"; decision_reason="ioc-changed" + elif [ -z "$prev_paged_at" ]; then + # Stamp continuity exists but we have no paged-at record (e.g., + # a maintainer hand-edited the rolling issue, or an older + # comment predates this gate). Be conservative — page so a + # silent standing IOC can't slip through the dedup window. + should_page="true"; decision_reason="paged-at-missing" + else + now_epoch="$(date -u +%s)" + prev_epoch="$(date -u -d "$prev_paged_at" +%s 2>/dev/null || echo 0)" + age_s=$((now_epoch - prev_epoch)) + if [ "$age_s" -ge "$WEEKLY_REPAGE_S" ]; then + should_page="true"; decision_reason="weekly-repage" + fi + fi + + { + echo "current_stamp=$current_stamp" + echo "prev_stamp=$prev_stamp" + echo "prev_paged_at=$prev_paged_at" + echo "should_page=$should_page" + echo "decision_reason=$decision_reason" + } >> "$GITHUB_OUTPUT" + echo "::notice::IOC dedup: should_page=$should_page reason=$decision_reason stamp=$current_stamp prev_stamp=${prev_stamp:-} prev_paged_at=${prev_paged_at:-}" + + - name: Update rolling issue with findings + # Update the rolling issue on any non-clean run (operational OR IOC). 
+ # Clean runs (exit 0) leave the issue alone — humans drive close / + # reopen so triage state survives across daily runs. + # + # `always()` keeps the rolling-issue update coupled to the sweep + # script's exit code only — decoupled from upstream step success. + # Without it, a flaky `Upload sweep artifacts` step (or any other + # earlier non-`always()` step that ends up failing) silently + # suppresses the operational-finding signal, leaving an exit-2 run + # invisible everywhere except the workflow run list. Mirrors the + # Slack page step's `always()` gating below. + if: always() && (steps.sweep.outputs.rc == '1' || steps.sweep.outputs.rc == '2') + shell: bash + env: + SWEEP_RC: ${{ steps.sweep.outputs.rc }} + SUMMARY_PATH: ${{ steps.sweep.outputs.summary_path }} + # Slack dedup state from the ioc-dedup step. Empty/unset for rc=2 + # runs (dedup step is gated on rc=1), which intentionally skips + # marker emission below so operational comments don't poison the + # IOC dedup chain. + IOC_STAMP: ${{ steps.ioc-dedup.outputs.current_stamp }} + SHOULD_PAGE: ${{ steps.ioc-dedup.outputs.should_page }} + DECISION_REASON: ${{ steps.ioc-dedup.outputs.decision_reason }} + PREV_PAGED_AT: ${{ steps.ioc-dedup.outputs.prev_paged_at }} + FIND_ROLLING_ISSUE_NUM: ${{ steps.find-rolling-issue.outputs.issue_num }} + # Issue ops are scoped to THIS repo (allora-network/.github) and + # only need the workflow's own GITHUB_TOKEN with `issues: write`. + # A wider GH_ORG_READ_TOKEN (used above for org enumeration) may + # or may not include issues:write here — keep this scoped narrow. + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + if [ ! -s "$SUMMARY_PATH" ]; then + # Script's precondition gates (missing IOC seed files, missing + # required tooling, pre-aggregation crash) can exit non-zero + # before summary.md is written. 
The workflow's documented + # contract is "rc != 0 → rolling-issue update"; silently + # exiting here would break that contract and let an operational + # failure go un-triaged. Substitute a minimal placeholder so + # the issue gets the rc and run-URL even when there's no + # script-generated summary to attach. + echo "::warning::sweep summary missing or empty at $SUMMARY_PATH; emitting precondition-failed placeholder for rolling-issue update." + SUMMARY_PATH=/tmp/sweep-summary-placeholder.md + printf 'precondition failed: sweep script exited %s before producing summary.md. See workflow logs for the underlying error (missing IOC seed files, missing dependency, or pre-aggregation crash).\n' "$SWEEP_RC" \ + > "$SUMMARY_PATH" + fi + + ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + run_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" + if [ "$SWEEP_RC" = "1" ]; then + grade="IOC findings (incident-grade)" + else + grade="operational findings only" + fi + + # Visible Slack-decision footer for IOC-grade runs so a human + # scanning the rolling issue can see at a glance whether THIS run + # paged Slack and why — without paging the channel needs a + # rendered cue, not just a hidden HTML marker. 
+ slack_line="" + if [ "$SWEEP_RC" = "1" ]; then + case "${SHOULD_PAGE:-}" in + true) + slack_line="- **Slack page:** yes (reason: \`${DECISION_REASON:-unknown}\`)" + ;; + false) + slack_line="- **Slack page:** suppressed (reason: \`${DECISION_REASON:-unknown}\`, prev paged \`${PREV_PAGED_AT:-unknown}\`; weekly re-page will fire if standing IOC persists)" + ;; + *) + slack_line="- **Slack page:** decision unavailable (ioc-dedup step did not run)" + ;; + esac + fi + + { + echo "### Sweep run — $ts" + echo + echo "- **Result:** $grade (script exit \`$SWEEP_RC\`)" + echo "- **Run:** $run_url" + echo "- **Trigger:** \`${GITHUB_EVENT_NAME}\`" + [ -n "$slack_line" ] && echo "$slack_line" + echo + # Summary content may contain attacker-controllable strings via + # IOC `detail` fields (e.g., lockfile lines, commit messages from + # scanned repos). Wrap in a fenced code block and strip characters + # that could escape the fence or inject markdown/HTML in the + # rendered issue body. findings.json (preserved in the uploaded + # artifact) is the canonical un-sanitized source for forensic + # review. + # + # Sanitization set MUST stay in sync with the Slack step below + # (search for the identical `tr -d` invocation). Both surfaces + # render attacker-influenced strings; asymmetric drift would + # leave one channel exploitable. Stripping `[]()` neutralizes + # GitHub-Flavored-Markdown link syntax (`[click](https://evil)`) + # which survives the `<>|*_`` strip alone. + echo '```' + tr -d '`<>|*_[]()' < "$SUMMARY_PATH" + echo '```' + # Persist the IOC stamp marker so the next IOC-grade run can + # compare against it. Emitted ONLY for rc=1 with a real stamp + # — operational-only runs (rc=2) MUST NOT emit stamps (they'd + # corrupt the IOC dedup chain). 
The stamp represents the + # dedup-decision input (the IOC set we observed this run), not + # the Slack-delivery outcome — emitting it here regardless of + # Slack's eventual success is correct: the next run's dedup + # gate needs to know what IOC set this run observed. + # + # The paged-at marker is intentionally NOT emitted here. A + # downstream "Persist Slack-paged marker" step (gated on + # `success() && rc==1 && should_page==true`) appends a separate + # comment containing only `` + # AFTER the Slack page step actually succeeds, so a failed + # Slack delivery doesn't poison the dedup chain by recording a + # paged-at timestamp for a page that never landed. The dedup + # reader (ioc-dedup step) scans the most-recent paged-at marker + # across ALL bot-authored comments, so splitting the markers + # across two comments composes correctly with no parser change. + # Stamp/dedup-reason values are workflow-generated (sha256 hex + # / decision tokens) — no attacker input — so they bypass the + # summary sanitizer safely. + if [ "$SWEEP_RC" = "1" ] && [ -n "${IOC_STAMP:-}" ]; then + echo + echo "" + echo "" + fi + } > /tmp/sweep-comment.md + + # Rolling-issue number resolved by the upstream find-rolling-issue + # step (oldest open issue with the rolling label wins, so a + # long-running incident issue stays the canonical thread even if + # someone files a duplicate later). A single canonical lookup + # eliminates the prior drift hazard between two independent + # `gh issue list` queries and closes a TOCTOU window where a + # human could close the rolling issue between the two queries. 
+ existing="${FIND_ROLLING_ISSUE_NUM:-}" + + if [ -n "$existing" ]; then + echo "Appending to existing rolling issue #$existing" + gh issue comment "$existing" --body-file /tmp/sweep-comment.md + else + echo "Creating new rolling issue (label=$ROLLING_LABEL)" + # `gh label create --force` upserts the label (no-op if it exists + # with matching properties; resets color/description otherwise). + # Idempotency makes this safe to run on every cold-start of the + # rolling-issue cycle without a preflight `gh label list` check. + gh label create "$ROLLING_LABEL" \ + --description "Daily Shai-Hulud IOC sweep findings (DEVOP-560)" \ + --color B60205 \ + --force >/dev/null 2>&1 || true + title="[shai-hulud-sweep] rolling findings — $(date -u +%Y-%m)" + gh issue create \ + --label "$ROLLING_LABEL" \ + --title "$title" \ + --body-file /tmp/sweep-comment.md + fi + + - name: Page Slack on IOC findings + # Only IOC-grade findings (exit 1) page Slack. Operational findings + # (exit 2) only update the rolling issue — those are worth review but + # not incident-grade and shouldn't generate after-hours pages. + # Also gated on the ioc-dedup step's `should_page` decision: a + # standing IOC that's identical to the previous run only re-pages on + # the weekly cadence (WEEKLY_REPAGE_S in ioc-dedup), not daily. See + # the ioc-dedup step above for the full decision policy and the + # cubic/ce-code-review P1 finding (PRRT_kwDOLZ5Xss6Ee5gN) that + # motivated this gate. + # No-ops cleanly if SLACK_SECURITY_WEBHOOK is unset (early exit, not + # a failure) so an org without the secret provisioned doesn't get a + # red daily workflow. 
+ if: always() && steps.sweep.outputs.rc == '1' && steps.ioc-dedup.outputs.should_page == 'true' + shell: bash + env: + SLACK_SECURITY_WEBHOOK: ${{ secrets.SLACK_SECURITY_WEBHOOK }} + SUMMARY_PATH: ${{ steps.sweep.outputs.summary_path }} + run: | + set -euo pipefail + if [ -z "${SLACK_SECURITY_WEBHOOK:-}" ]; then + echo "::warning::SLACK_SECURITY_WEBHOOK secret is not set; skipping Slack page." + exit 0 + fi + run_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" + # Read+sanitize the full summary, then let jq codepoint-slice to + # the Slack 3000-char-per-block limit below. An earlier version + # used `head -c 6000` to cap pre-sanitization, but byte-boundary + # truncation can split a multi-byte UTF-8 sequence (non-ASCII + # content can appear in IOC `detail` fields quoted from scanned + # repos). Under `set -euo pipefail`, a malformed UTF-8 byte + # passed via `jq --arg` aborts the step, silently dropping the + # IOC-grade Slack page on an unlucky boundary. summary.md is + # bounded by IOC count (one line per finding) so reading the + # whole file is safe in practice. + # + # Strip characters that could escape the Slack code fence or + # inject block-kit mrkdwn. IOC `detail` strings are attacker- + # influenceable; findings.json (preserved in the uploaded + # artifact) is the canonical un-sanitized source for forensic + # review. + # + # Sanitization set MUST stay in sync with the rolling-issue step + # above (search for the identical `tr -d` invocation). Both + # surfaces render attacker-influenced strings; asymmetric drift + # would leave one channel exploitable. Stripping `[]()` + # neutralizes Slack-mrkdwn / link-style sequences that survive + # the `<>|*_`` strip alone. + summary="$(tr -d '`<>|*_[]()' < "$SUMMARY_PATH" 2>/dev/null || echo 'summary unavailable')" + # jq builds the payload so embedded quotes/newlines/unicode in the + # summary can't break the JSON or inject Slack block-kit markup. 
+          payload="$(jq -nc \
+            --arg run "$run_url" \
+            --arg org "$ORG" \
+            --arg summary "$summary" \
+            '{
+              text: (":rotating_light: Shai-Hulud IOC findings — " + $org),
+              blocks: [
+                { type: "header",
+                  text: { type: "plain_text", text: "Shai-Hulud IOC sweep — incident-grade findings" } },
+                { type: "section",
+                  text: { type: "mrkdwn",
+                          text: (":rotating_light: *Shai-Hulud IOC findings — " + $org + "*\n<" + $run + "|Open workflow run>") } },
+                { type: "section",
+                  text: { type: "mrkdwn",
+                          text: ("```\n" + ($summary | .[0:2800]) + "\n```") } }
+              ]
+            }')"
+          # Retry on transient HTTP failures with exponential backoff (5s,
+          # 15s, 45s). Implemented in shell (not `curl --retry`) so we can
+          # honor Slack's Retry-After header on HTTP 429.
+          # Success: HTTP 200. Retry: 408/429/5xx + curl connect failures.
+          # Terminal: any other 4xx (configuration bug, bad webhook, etc.).
+          #
+          # `RETRY_AFTER_MAX_S` caps the server-supplied Retry-After value
+          # so a misbehaving proxy (or a compromised webhook URL returning
+          # `Retry-After: 86400`) cannot consume the job's 60-minute budget
+          # in a single sleep and silently drop the IOC-grade Slack page.
+          # 120s comfortably exceeds Slack's documented Retry-After values
+          # while staying well inside the job timeout headroom.
+          http_code=000
+          attempt=0
+          delays=(5 15 45)
+          # Derive max_attempts from the delays array so the two cannot
+          # drift — bumping retries by adding a delay value is enough.
+          # A delay is only consumed BETWEEN attempts (the loop breaks
+          # before indexing `delays` once `attempt` reaches max_attempts),
+          # so N delays serve N+1 attempts. Using `${#delays[@]}` alone
+          # capped the loop at 3 attempts and left the final 45s backoff
+          # unreachable. With the +1, the index `attempt - 1` always stays
+          # within 0..N-1, so `sleep ""` cannot occur.
+          max_attempts=$(( ${#delays[@]} + 1 ))
+          RETRY_AFTER_MAX_S=120
+          while [ "$attempt" -lt "$max_attempts" ]; do
+            attempt=$((attempt + 1))
+            # `curl -w '%{http_code}'` writes the HTTP code (or `000` on
+            # transport failure: connect refused, DNS fail, TLS error,
+            # connect-timeout) to stdout AND exits non-zero on transport
+            # failure. The previous `|| echo 000` form appended an extra
+            # `000` to that stdout, producing the literal string `000000`,
+            # which falls through the `000|408|429|5*` case to terminal=0
+            # and silently disables the curl-level retry path the loop
+            # exists for. `|| true` here keeps `set -euo pipefail` happy
+            # without corrupting the captured value; the `:= 000` default
+            # below pins the rare empty-output case to the documented
+            # transient-classification token.
+            http_code="$(curl -sS --connect-timeout 5 --max-time 15 \
+              -X POST -H 'Content-Type: application/json' \
+              --data "$payload" \
+              -D /tmp/slack.headers \
+              -o /tmp/slack.out \
+              -w '%{http_code}' \
+              "$SLACK_SECURITY_WEBHOOK" || true)"
+            : "${http_code:=000}"
+            if [ "$http_code" = "200" ]; then
+              break
+            fi
+            # Decide whether to retry. 408/429/5xx and curl-level failures
+            # (http_code=000) are transient; other 4xx are terminal.
+            case "$http_code" in
+              000|408|429|5*) transient=1 ;;
+              *) transient=0 ;;
+            esac
+            if [ "$transient" -ne 1 ] || [ "$attempt" -ge "$max_attempts" ]; then
+              break
+            fi
+            sleep_for="${delays[$((attempt - 1))]}"
+            # If Slack sent Retry-After (429), prefer it when larger than our
+            # default backoff — being polite is cheap. Cap at
+            # RETRY_AFTER_MAX_S so a hostile/misconfigured upstream can't
+            # blow past the job timeout in a single sleep.
+            #
+            # Match the header name via `tolower($1)` rather than gawk's
+            # `IGNORECASE=1`: that variable is a gawk extension and is an
+            # ordinary no-op variable in mawk/busybox awk, where the match
+            # would be case-sensitive and miss the lowercase `retry-after:`
+            # that HTTP/2 responses carry.
+            retry_after="$(awk 'tolower($1) == "retry-after:" {gsub(/[\r\n]/,"",$2); print $2; exit}' /tmp/slack.headers 2>/dev/null || true)"
+            if [ "$http_code" = "429" ] && [ -n "$retry_after" ] && [ "$retry_after" -gt "$sleep_for" ] 2>/dev/null; then
+              if [ "$retry_after" -gt "$RETRY_AFTER_MAX_S" ] 2>/dev/null; then
+                echo "::warning::Slack Retry-After=${retry_after}s exceeds cap ${RETRY_AFTER_MAX_S}s; clamping."
+                retry_after="$RETRY_AFTER_MAX_S"
+              fi
+              sleep_for="$retry_after"
+            fi
+            echo "::warning::Slack webhook attempt $attempt/$max_attempts returned HTTP $http_code; retrying in ${sleep_for}s."
+ sleep "$sleep_for" + done + if [ "$http_code" != "200" ]; then + echo "::error::Slack webhook returned HTTP $http_code after $attempt attempt(s): $(head -c 500 /tmp/slack.out 2>/dev/null || true)" + exit 1 + fi + echo "Slack page delivered (HTTP $http_code, attempts=$attempt)." + + - name: Persist Slack-paged marker + # Append a one-line bot comment recording the moment Slack + # actually delivered the IOC page. Gated on `success()` of the + # upstream `Page Slack on IOC findings` step (and `rc==1 + + # should_page=='true'` for defense-in-depth), so a failed Slack + # delivery never writes a paged-at marker. Previously this marker + # was emitted inside the rolling-issue update step (before Slack + # ran), meaning a failed Slack send would still record a paged-at + # timestamp and silently corrupt the dedup chain for up to 7 days + # — the dedup gate would believe Slack had paged, suppress the + # next IOC-grade run's page, and the standing IOC would silently + # stop alerting until the weekly re-page window expired. + # + # The marker is in a SEPARATE comment from the IOC-stamp comment + # written by the update step above. The ioc-dedup reader scans + # the most-recent `` across ALL + # bot-authored comments on the rolling issue (sorted by + # created_at desc), so splitting the markers across two comments + # composes correctly with no parser change. + # + # `success() && ...` — `success()` is critical here: a default + # `if: always() && ...` would fire even when the Slack page step + # FAILED, reintroducing the very bug this gate exists to prevent. + if: success() && steps.sweep.outputs.rc == '1' && steps.ioc-dedup.outputs.should_page == 'true' + shell: bash + env: + FIND_ROLLING_ISSUE_NUM: ${{ steps.find-rolling-issue.outputs.issue_num }} + # `issues:write` only — same scope as the rolling-issue update + # step. The marker comment is workflow-generated content (a + # single hidden HTML marker with a workflow-supplied ISO8601 + # timestamp), no attacker input. 
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + issue_num="${FIND_ROLLING_ISSUE_NUM:-}" + if [ -z "$issue_num" ]; then + # The update step above either appended to an existing issue + # or created a new one. If find-rolling-issue saw nothing AND + # the update step created a fresh issue (because $existing was + # empty), we need to look the new issue up here. Otherwise + # the paged-at marker would silently land nowhere and the + # dedup chain would believe Slack never paged. + issue_num="$(gh issue list \ + --search "label:\"$ROLLING_LABEL\" state:open sort:created-asc" \ + --limit 1 \ + --json number \ + --jq '.[0].number // empty' 2>/dev/null || true)" + fi + if [ -z "$issue_num" ]; then + echo "::warning::No rolling issue resolvable for paged-at marker; the next IOC-grade run will treat this as 'paged-at-missing' and conservatively page (fail-OPEN). Investigate why the update step neither found nor created a rolling issue." + exit 0 + fi + paged_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + # The marker MUST be the only meaningful content of the comment + # — the dedup scanner regex-greps for the marker in comment + # bodies, so additional surrounding text is fine but kept + # minimal here so human readers can see at a glance what the + # comment records. + { + echo "Slack page delivered at \`${paged_at}\`." + echo + echo "" + } > /tmp/sweep-paged-at-comment.md + gh issue comment "$issue_num" --body-file /tmp/sweep-paged-at-comment.md + echo "::notice::Recorded paged-at=${paged_at} on rolling issue #${issue_num}." + + - name: Slack page suppressed by IOC dedup + # IOC-grade run that matched the previous run's stamp and hasn't + # aged past the weekly re-page window. Surface a workflow notice + # so the run page makes the dedup decision obvious to anyone + # checking why Slack didn't fire — the rolling issue still got + # appended (above) for triage continuity. 
+ if: always() && steps.sweep.outputs.rc == '1' && steps.ioc-dedup.outputs.should_page != 'true' + shell: bash + env: + DECISION_REASON: ${{ steps.ioc-dedup.outputs.decision_reason }} + PREV_PAGED_AT: ${{ steps.ioc-dedup.outputs.prev_paged_at }} + run: | + echo "::notice::IOC-grade run; Slack page suppressed by dedup (reason=${DECISION_REASON:-dedup-skip}, prev_paged_at=${PREV_PAGED_AT:-}). Rolling issue updated; weekly re-page will fire if the standing IOC persists." + + - name: Final run summary + if: always() + shell: bash + env: + SWEEP_RC: ${{ steps.sweep.outputs.rc }} + SHOULD_PAGE: ${{ steps.ioc-dedup.outputs.should_page }} + DECISION_REASON: ${{ steps.ioc-dedup.outputs.decision_reason }} + run: | + # SHOULD_PAGE is a tri-state output of the ioc-dedup step: + # true → Slack page step fired this run + # false → Slack page step suppressed by dedup gate this run + # '' → ioc-dedup step did not produce outputs (e.g., the + # step crashed under set -euo pipefail before writing + # to $GITHUB_OUTPUT). The Slack page step's strict + # `should_page == 'true'` gate then suppressed Slack; + # the suppressed-by-dedup notice step's `!= 'true'` + # gate fired ALSO. Surface this third state explicitly + # rather than defaulting to "paged" — defaulting hid a + # three-way contradiction between the Slack gate, the + # suppression-notice gate, and this summary. + case "${SWEEP_RC:-x}" in + 0) echo "::notice::Sweep clean (no findings)." ;; + 1) + case "${SHOULD_PAGE:-}" in + true) + echo "::warning::Sweep produced IOC-grade findings; Slack paged (reason=${DECISION_REASON:-unknown}). See rolling issue + Slack." + ;; + false) + echo "::warning::Sweep produced IOC-grade findings; Slack page deduped (reason=${DECISION_REASON:-dedup-skip}). See rolling issue; weekly re-page will fire if standing IOC persists." 
+ ;; + *) + echo "::error::Sweep produced IOC-grade findings; Slack decision unavailable (ioc-dedup step did not produce outputs — likely crashed before writing GITHUB_OUTPUT). Investigate the ioc-dedup step in this run." + ;; + esac + ;; + 2) echo "::warning::Sweep produced operational findings only. See rolling issue." ;; + *) echo "::error::Sweep failed to produce a meaningful exit code (rc='${SWEEP_RC:-unset}')." ;; + esac diff --git a/docs/plans/2026-05-25-devop-560-shai-hulud-sweep.md b/docs/plans/2026-05-25-devop-560-shai-hulud-sweep.md new file mode 100644 index 0000000..0d6842e --- /dev/null +++ b/docs/plans/2026-05-25-devop-560-shai-hulud-sweep.md @@ -0,0 +1,117 @@ +# DEVOP-560 — Daily Shai-Hulud IOC sweep workflow + +Linear: + +## Decisions + +- **Script location: vendored, not cloned.** The canonical script lives in + `allora-network/skills` (PR #69, currently open), which is a **private repo**. + The org `.github` workflow runs under the default `GITHUB_TOKEN` whose scope is + bounded to this repo, so cross-repo private clones would require provisioning + an extra deploy token / GitHub App. Self-containment also keeps the daily + sweep working if the skills repo is ever rotated, renamed, or temporarily + unavailable. Vendor a verbatim copy at `scripts/shai-hulud-ioc-sweep.sh`, + with a header pointer to the canonical upstream path + commit SHA and a + refresh procedure for keeping it in sync. +- **IOC inputs:** read `.github/security/ioc-packages.txt` and + `.github/security/ioc-hashes.txt` from the workflow checkout (merged in PR #2 + via DEVOP-561). The script validates the `# schema:v1` header before running. +- **Rolling issue:** find the open issue labelled `shai-hulud-sweep` in this + repo. If new findings exist and an open issue is present, append a comment + with the run summary; if no open issue exists, open one with that label. The + workflow never auto-closes; humans drive close-and-reopen so triage state is + preserved across runs. 
+- **Slack alert path:** post the run summary to `SLACK_SECURITY_WEBHOOK` only + when the run produces **new** IOC findings — defined as `rc == 1` AND + (no prior IOC-grade rolling-issue comment exists, OR today's IOC stamp + differs from the previous one, OR ≥ `WEEKLY_REPAGE_S` (7 days) have + elapsed since the last Slack page). Operational findings (clone_failed / + check_skipped / go_local_replace, exit 2) update the rolling issue but + do not page Slack. The dedup stamp is `sha256` of the sorted + `{repo, rule, path, detail}` TSV of IOC-grade rows from `findings.json` + (`ts` is intentionally excluded so an identical IOC set produces an + identical stamp across daily runs). Previous-run state is recovered + from hidden HTML-comment markers embedded in the rolling-issue comments: + an IOC-stamp marker (always on IOC-grade comments) and + a paged-at timestamp marker (only on comments where Slack was + actually paged, so a deduped run preserves the older real timestamp + and the weekly re-page window stays honest). Do NOT regress this to a + bare `if: rc == '1'` Slack gate — that's the alert-fatigue regression + surfaced by cubic (`PRRT_kwDOLZ5Xss6Ee5gN`) and corroborated by four + ce-code-review reviewers (anchor 100). A standing unresolved IOC pages + daily under bare gating and conditions responders to mute the channel. + Both the dedup gate AND the weekly re-page are required: bare dedup + without re-page lets a forgotten standing IOC silently age out forever. + IOC_RULES_RE in the workflow's dedup step MUST stay in sync with + `scripts/shai-hulud-ioc-sweep.sh` (search for `IOC_RULES_RE`) — drift + would either mis-dedup a real new IOC or re-page on operational-only + changes that didn't bump the stamp. +- **Schedule:** `cron: '7 4 * * *'` (04:07 UTC, off-peak + off-minute), plus + `workflow_dispatch` for manual / debugging runs. +- **Permissions:** `contents: read` + `issues: write`. No other scopes. 
+- **Member exfil search:** the default `GITHUB_TOKEN` does not carry `read:org`, + so member enumeration will emit `check_skipped` operational findings. Wire a + `GH_ORG_READ_TOKEN` secret in a follow-up if/when org-admin signs off — the + workflow already prefers it when present. + +## Third-party action SHA rotation + +Both third-party actions used by `.github/workflows/shai-hulud-sweep.yml` are +pinned to immutable commit SHAs (not floating tags). Pinning to a SHA is the +hard requirement; **rotation is the maintenance burden that comes with it.** + +| Action | Current pin | Tag at pin | Released | +| --- | --- | --- | --- | +| `actions/checkout` | `11bd71901bbe5b1630ceea73d27597364c9af683` | `v4.2.2` | 2024-10 | +| `actions/upload-artifact` | `b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882` | `v4.4.3` | 2024-10 | + +- **Owner:** `@allora-network/devops`. The workflow file is co-owned by + `@allora-network/security` (per `.github/CODEOWNERS`), so a SHA bump still + goes through security review — devops drives the cadence, security signs + off. +- **Cadence:** quarterly review (Jan / Apr / Jul / Oct) **plus** an immediate + rotation on any CVE alert affecting either action. The quarterly cadence is + cheap (two SHAs, ~10 minutes per cycle) and keeps us from drifting more + than ~3 months behind upstream security fixes. +- **Canonical source for the latest release SHA:** + - `actions/checkout`: <https://github.com/actions/checkout/releases> → + pick the latest `vN.N.N` release → expand "Assets" → copy the **commit + SHA** from the release tag's commit (NOT the release-tag's own object + SHA, which is a tag object). + - `actions/upload-artifact`: + <https://github.com/actions/upload-artifact/releases> → same procedure. + - Verify a candidate SHA against the action's signed releases tab before + bumping. Never pin off `main` or a branch tip. +- **Rotation procedure:** + 1. Update the `uses: actions/...@<sha>` line in + `.github/workflows/shai-hulud-sweep.yml`. + 2. Update the inline `# SHA pin: ... vX.Y.Z (YYYY-MM)` comment to match. + 3. Update this table. + 4. 
Open a PR — security CODEOWNER will be auto-requested by virtue of the + workflow path rule. +- **Automation follow-up:** add `.github/dependabot.yml` with a + `github-actions` package-ecosystem entry so Dependabot opens a PR with the + new SHA on each release. This is additive and out of scope for the initial + workflow ship — track separately. When added, the quarterly manual cadence + collapses into "review the Dependabot PR within the same calendar quarter". + +## Follow-ups (org-admin / out-of-scope for this PR) + +- **Branch protection: require CODEOWNERS review on `main`.** The in-repo + `.github/CODEOWNERS` rule is wired (DEVOP-560 Finding A), but it only + enforces auto-requested reviewers — the actual blocking gate ("Require + review from Code Owners") lives in branch protection / rulesets and is + org-admin territory. Open as a separate issue once this PR is merged. +- **Missed-run / daily-cron observability (DEVOP-560 Finding F).** GitHub + Actions silently auto-disables scheduled workflows after 60 days of repo + inactivity, and there is no native "the daily cron didn't fire" signal. + The current workflow relies on the rolling issue being updated daily; if a + run is silently skipped, that signal is absent rather than negative. Out + of scope for this PR because the fix is materially additive — either a + second watchdog workflow (different repo, hourly, that pings the API for + this workflow's last successful run timestamp and pages Slack if > 26h + old) or a healthchecks.io / Better Stack heartbeat URL hit at the end of + every successful run. Track separately. +- **Dependabot for `github-actions`.** See SHA-rotation section above. Adds + `.github/dependabot.yml` with a `github-actions` ecosystem entry. Additive + + low-risk; can ship alongside or after this PR. 
diff --git a/scripts/shai-hulud-ioc-sweep.sh b/scripts/shai-hulud-ioc-sweep.sh new file mode 100755 index 0000000..d852627 --- /dev/null +++ b/scripts/shai-hulud-ioc-sweep.sh @@ -0,0 +1,595 @@ +#!/usr/bin/env bash +# +# shai-hulud-ioc-sweep.sh — scan a GitHub org for Shai-Hulud indicators of +# compromise. Consumed by .github/workflows/shai-hulud-sweep.yml. +# +# === Vendoring note === +# This is a verbatim copy of the canonical script at: +# allora-network/skills @ skills/shai-hulud-defense/scripts/shai-hulud-ioc-sweep.sh +# pinned commit: 71aeefb422b2dd0d41118277b3aa122345190c7b (PR #69, open) +# +# We vendor instead of cloning the skills repo at workflow time because +# allora-network/skills is private and the default GITHUB_TOKEN cannot read it; +# vendoring also makes the daily sweep robust to upstream rename / outage. +# Refresh procedure: when the upstream script changes (skills repo PR merged +# or follow-up commits land), copy the file back into this path, update the +# pinned commit above, and bump the schema-version header in this script's +# error path if the IOC file format also changed. +# +# Read-only. Exit codes: +# 0 clean — no findings at all +# 1 IOC finding(s) — INVOKE INCIDENT RESPONSE +# 2 operational issues only (clone_failed / check_skipped / go_local_replace) +# +# Usage: +# shai-hulud-ioc-sweep.sh <org> [ioc-packages.txt] [ioc-hashes.txt] +# +# Env: +# OUTPUT_DIR — defaults to ./.shai-hulud-sweep/<UTC YYYYmmdd-HHMMSS> +# GO_TRUSTED_HOSTS_RE — extended regex of trusted module-path prefixes for +# Go `replace` RHS. Defaults to allora-network's +# allowlist; override per org to silence false +# `go_suspicious_replace` findings. +# GO_REPLACE_ALLOWED_FILE — optional path to a list of explicit +# `LHS_top_level => RHS_top_level` aliases that are +# allowed despite top-level path mismatch (one per +# line, `#` comments allowed). Defends Scenario C +# in-org redirect attacks. +# +# Requires: gh (authenticated, non-SSH), jq, sha256sum (or shasum -a 256), +# git, find, awk. 
+ +set -euo pipefail + +ORG="${1:-${GITHUB_ORG:-}}" +PACKAGES_FILE="${2:-./ioc-packages.txt}" +HASHES_FILE="${3:-./ioc-hashes.txt}" +OUTPUT_DIR="${OUTPUT_DIR:-./.shai-hulud-sweep/$(date -u +%Y%m%d-%H%M%S)}" + +# Default trust allowlist matches REFERENCE.md's hardened CI workflow snippet. +# Other orgs override via GO_TRUSTED_HOSTS_RE so they don't get false +# `go_suspicious_replace` findings for their own GitHub mirrors. +GO_TRUSTED_HOSTS="${GO_TRUSTED_HOSTS_RE:-github\.com/(allora-network|cometbft|cosmos|ethereum|fluxcd)|gopkg\.in|google\.golang\.org|go\.uber\.org|k8s\.io|sigs\.k8s\.io|go\.opentelemetry\.io}" +GO_REPLACE_ALLOWED_FILE="${GO_REPLACE_ALLOWED_FILE:-}" + +[ -n "$ORG" ] || { echo "usage: $0 [packages.txt] [hashes.txt]" >&2; exit 2; } +[ -f "$PACKAGES_FILE" ] || { echo "missing IOC packages file: $PACKAGES_FILE" >&2; exit 2; } +[ -f "$HASHES_FILE" ] || { echo "missing IOC hashes file: $HASHES_FILE" >&2; exit 2; } +command -v gh >/dev/null || { echo "gh required" >&2; exit 2; } +command -v jq >/dev/null || { echo "jq required" >&2; exit 2; } + +# Schema-version assertion — a silent schema break in the sibling .github repo +# (e.g. dropping the `ecosystem:` prefix) would otherwise corrupt parsing and +# produce a false-clean sweep. Bump in lockstep when the seed-list format +# changes. Both seed files (packages + hashes) carry the header so a future +# reformat of either side fails loud instead of silently zero-matching against +# the whole org. +if ! head -n1 "$PACKAGES_FILE" | grep -qE '^#[[:space:]]*schema:v1'; then + echo "IOC packages file $PACKAGES_FILE missing '# schema:v1' header — refusing to run (would silently false-clean on parser drift)." >&2 + exit 2 +fi +if ! head -n1 "$HASHES_FILE" | grep -qE '^#[[:space:]]*schema:v1'; then + echo "IOC hashes file $HASHES_FILE missing '# schema:v1' header — refusing to run (would silently false-clean on parser drift)." 
>&2 + exit 2 +fi + +# Auth assertion — refuse to run unauthenticated; private repos would silently +# clone_failed and the sweep would mislabel a partial scan as org-wide. +if ! gh auth token >/dev/null 2>&1; then + echo "gh is not authenticated — run 'gh auth login' first. Sweep would silently skip every private repo." >&2 + exit 2 +fi +# Token capture removed from execution after PRRT_kwDOQ91i5M6EVwmd / cubic#218: +# we now delegate to `gh auth git-credential` inside the per-clone credential +# helper, which keeps the token out of git's argv entirely. Restore this line +# only if reverting to an inlined-token credential helper, AND first fix the +# argv-exposure issue (e.g. single-quote the helper body and `export +# GH_TOKEN_VALUE` so it's expanded by the helper's subshell at credential +# time, not by the parent shell at git-launch time). +# GH_TOKEN_VALUE="$(gh auth token 2>/dev/null)" + +# Warn loudly if operator has SSH-default git_protocol set — historically a +# common silent failure mode; the credential-helper override below mitigates it +# but the warning makes the precondition explicit. +if [ "$(gh config get git_protocol 2>/dev/null || true)" = "ssh" ]; then + echo "WARN: gh git_protocol=ssh detected. Sweep injects the gh OAuth token via credential helper so private repos still clone, but interactive operators may see unexpected SSH-key prompts disabled. Continuing." >&2 +fi + +SHA256_CMD="$(command -v sha256sum || true)" +[ -n "$SHA256_CMD" ] || SHA256_CMD="shasum -a 256" + +mkdir -p "$OUTPUT_DIR" +FINDINGS="$OUTPUT_DIR/findings.json" +FINDINGS_NDJSON="$OUTPUT_DIR/findings.ndjson" +SUMMARY="$OUTPUT_DIR/summary.md" +EVIDENCE_DIR="$OUTPUT_DIR/evidence" +: > "$FINDINGS_NDJSON" +mkdir -p "$EVIDENCE_DIR" +: > "$OUTPUT_DIR/.dirty-repos" + +# IOC rule names that count toward the IR-incident exit code (1). All other +# findings are operational (clone_failed / check_skipped / go_local_replace) +# and only escalate to exit code 2, never the IR banner. 
+IOC_RULES_RE='^(ioc_package_match|ioc_bundle_hash|persistence_workflow|suspicious_lifecycle_script|public_exfil_repo|public_exfil_repo_member|go_suspicious_replace|go_replace_path_mismatch|go_unsafe_env|go_unsafe_env_indirect)$' + +# Per-repo IOC-finding presence is tracked in a plain file (one repo per line, +# deduped on lookup) instead of a bash assoc array. macOS ships /bin/bash 3.2 +# which lacks `declare -A`; using a file keeps the script portable to stock +# macOS Homebrew/CI runners. +DIRTY_REPOS_FILE="$OUTPUT_DIR/.dirty-repos" + +# log MESSAGE... — timestamped (UTC HH:MM:SS) progress line on stderr. +log() { printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; } +# finding REPO RULE PATH DETAIL — appends one NDJSON object per call. O(1) per +# write, atomic via POSIX O_APPEND for line-bounded writes, and safe for any +# future xargs -P parallelization of the per-repo loop. Aggregated into a JSON +# array at the end of the run. +finding(){ + local repo="$1" rule="$2" path="$3" detail="$4" + jq -nc --arg r "$repo" --arg ru "$rule" --arg p "$path" --arg d "$detail" \ + '{repo:$r, rule:$ru, path:$p, detail:$d, ts: now}' \ + >> "$FINDINGS_NDJSON" + # Track which repos produced a non-operational finding so we can preserve + # their working tree as forensic evidence (see end-of-loop cleanup). + # Operational-only rules (clone_failed, check_skipped, go_local_replace) + # never mark a repo dirty. + case "$rule" in + clone_failed|check_skipped|go_local_replace) ;; + *) printf '%s\n' "$repo" >> "$DIRTY_REPOS_FILE" ;; + esac +} + +sort "$PACKAGES_FILE" -o "$OUTPUT_DIR/packages.sorted" +# Normalise the hash list to bare lowercase digests before sorting. The seed +# format allows the `# schema:v1` header, blank lines and a trailing +# `# comment` after each hash; left in place, none of those lines can ever +# satisfy the exact-line (`grep -Fx`) lookup used by the JS hash scan, so a +# commented IOC hash would silently never match. +sed -e 's/#.*$//' -e 's/[[:space:]]//g' -e '/^$/d' "$HASHES_FILE" \ + | tr '[:upper:]' '[:lower:]' | sort -u > "$OUTPUT_DIR/hashes.sorted" + +# Build per-ecosystem needle lists once. The previous single-substring grep +# missed most lockfile formats (pip/poetry/Pipfile/go.sum/modern +# package-lock.json) because each ecosystem encodes dependency identity +# differently. PEP-503-normalize pip names. 
Emit multiple needle shapes per +# IOC so we catch both yarn/pnpm `name@version` patterns and modern +# package-lock.json `"version": "x.y.z"` entries scoped to the right package +# key. +: > "$OUTPUT_DIR/needles.npm" +: > "$OUTPUT_DIR/needles.pip" +: > "$OUTPUT_DIR/needles.go" +awk -F: ' + /^[[:space:]]*#/ { next } + /^[[:space:]]*$/ { next } + NF < 2 { next } + { + eco = $1 + sub("^" eco ":", "", $0) + pkg = $0 + sub(/[[:space:]]+$/, "", pkg) + # Split name@version, allowing @ inside module path (Go). + n = 0 + for (i = length(pkg); i >= 1; i--) { + if (substr(pkg, i, 1) == "@") { n = i; break } + } + if (n == 0) next + name = substr(pkg, 1, n - 1) + ver = substr(pkg, n + 1) + if (name == "" || ver == "") next + if (eco == "npm") { + print name "@" ver >> "'"$OUTPUT_DIR"'/needles.npm" + # Bare "name" needle is intentionally NOT emitted — it produced false + # positives on any quoted occurrence of the package name at any + # version (see PRRT_kwDOQ91i5M6EVwmh / cubic#145). Modern npm v2/v3 + # package-lock.json coverage is provided by a structured jq pass in + # the lockfile loop (`if [ "$bn" = "package-lock.json" ]`), which + # walks `.packages[].version` (v2/v3) and the recursive + # `.dependencies` tree (v1) and exact-matches projected name@version + # tuples against this same needle file. That structured pass closes + # the false-clean gap for lockfiles written without `resolved` URLs + # (workspaces, private-registry overrides, npm omit-resolved + # configs) without reintroducing the false-positive risk. + print name "-" ver ".tgz" >> "'"$OUTPUT_DIR"'/needles.npm" + } else if (eco == "pip") { + lname = tolower(name) + gsub(/[._]+/, "-", lname) + print lname "==" ver >> "'"$OUTPUT_DIR"'/needles.pip" + } else if (eco == "go") { + print name " " ver >> "'"$OUTPUT_DIR"'/needles.go" + } + } +' "$OUTPUT_DIR/packages.sorted" + +# Canonical exclude args for every find walk in the per-repo loop. 
Without +# these, lockfiles/JS bundles/go.mod files committed under node_modules/ +# vendor/.git balloon scan time by orders of magnitude on monorepos AND +# produce false positives whose path field points at vendored transitive +# dependencies (misleads triage). +EXCLUDE_FIND=( -not -path '*/node_modules/*' -not -path '*/vendor/*' -not -path '*/.git/*' ) + +# Map lockfile basename → ecosystem so each lockfile is grepped only against +# its own ecosystem's IOC needles (a pip IOC against a yarn.lock is noise). +lockfile_eco() { + case "$1" in + package-lock.json|pnpm-lock.yaml|yarn.lock|bun.lock) echo npm ;; + requirements.txt|requirements.lock|poetry.lock|Pipfile.lock) echo pip ;; + go.sum) echo go ;; + *) echo unknown ;; + esac +} + +log "Listing repos in $ORG..." +# Paginate the full repo list — fixed --limit silently truncates large orgs. +# `gh api ... --paginate` walks Link headers; jq filters to non-empty default branches. +gh api -H "Accept: application/vnd.github+json" --paginate \ + "/orgs/$ORG/repos?per_page=100&type=all" \ + --jq '.[] | select(.default_branch != null and .default_branch != "") | .name' \ + > "$OUTPUT_DIR/repos.txt" +REPO_COUNT=$(wc -l < "$OUTPUT_DIR/repos.txt" | tr -d ' ') +log "Found $REPO_COUNT repos (full pagination). Sweeping..." + +# Build the `go_replace` aliased-alias allowlist as a normalized lookup file +# (one `LHS_topRHS_top` tuple per line). File-based instead of assoc +# array so stock macOS /bin/bash 3.2 still works. 
Each input line is: +# github.com/legit/x github.com/legit-mirror/x +GO_REPLACE_ALLOW_NORM="$OUTPUT_DIR/.go-replace-allow" +: > "$GO_REPLACE_ALLOW_NORM" +if [ -n "$GO_REPLACE_ALLOWED_FILE" ] && [ -f "$GO_REPLACE_ALLOWED_FILE" ]; then + while IFS= read -r line; do + case "$line" in ''|'#'*) continue ;; esac + lhs_top="${line%% *}" + rhs_top="${line##* }" + if [ -n "$lhs_top" ] && [ -n "$rhs_top" ]; then + printf '%s\t%s\n' "$lhs_top" "$rhs_top" >> "$GO_REPLACE_ALLOW_NORM" + fi + done < "$GO_REPLACE_ALLOWED_FILE" +fi + +# Clones directory is purely scratch — preserved evidence lives in +# $EVIDENCE_DIR. Even on unclean exits (signal, set -e abort, transient +# clone failure), drop the scratch tree so disk doesn't fill on long runs. +trap 'rm -rf -- "$OUTPUT_DIR/clones" 2>/dev/null || true' EXIT + +while IFS= read -r repo; do + log " $repo" + WORK="$OUTPUT_DIR/clones/$repo" + mkdir -p "$(dirname "$WORK")" + # Use the gh OAuth token via a per-clone credential helper so private-repo + # access does not depend on whatever credential helper / SSH key happens to + # be configured on the operator's workstation. Previously, an operator with + # `gh config get git_protocol = ssh` could silently `clone_failed` every + # private repo, producing a partial sweep mislabeled as org-wide. Plain + # `gh repo clone` likewise honors the operator's git_protocol setting. + # + # Delegate to `gh auth git-credential` instead of inlining the token in a + # shell-expanded helper body: the previous form embedded $GH_TOKEN_VALUE in + # git's argv (visible via `ps`/`/proc//cmdline`), and gh's built-in + # credential helper resolves the token from gh's own auth state at fetch + # time without putting it on any command line. (Auth presence is asserted + # above via `gh auth token`, so this helper always has a credential to + # return.) See PRRT_kwDOQ91i5M6EVwmd / cubic#218. + if ! 
git -c "credential.helper=" \ + -c "credential.helper=!gh auth git-credential" \ + clone --depth 1 --no-tags --quiet \ + "https://github.com/$ORG/$repo.git" "$WORK" 2>/dev/null; then + finding "$repo" "clone_failed" "" "git clone (gh-token credential helper) failed (network/empty repo/permission)" + continue + fi + + # Single tree walk per repo collects every file we care about (lockfiles, + # JS files for hash IOCs, package.json, go.mod, workflow YAMLs). One find + # invocation replaces 9+ per-target walks and the per-IOC inner grep loop + # is replaced by a single `grep -F -f needles` per lockfile. + LOCKS_FILE="$OUTPUT_DIR/.scan/$repo.locks" + JS_FILE="$OUTPUT_DIR/.scan/$repo.js" + PKG_FILE="$OUTPUT_DIR/.scan/$repo.pkg" + GOMOD_FILE="$OUTPUT_DIR/.scan/$repo.gomod" + WF_FILE="$OUTPUT_DIR/.scan/$repo.wf" + PERSIST_FILE="$OUTPUT_DIR/.scan/$repo.persist" + mkdir -p "$(dirname "$LOCKS_FILE")" + : > "$LOCKS_FILE"; : > "$JS_FILE"; : > "$PKG_FILE"; : > "$GOMOD_FILE"; : > "$WF_FILE"; : > "$PERSIST_FILE" + + while IFS= read -r path; do + bn="${path##*/}" + case "$bn" in + package-lock.json|pnpm-lock.yaml|yarn.lock|bun.lock|requirements.txt|requirements.lock|poetry.lock|Pipfile.lock|go.sum) + printf '%s\n' "$path" >> "$LOCKS_FILE" ;; + package.json) + printf '%s\n' "$path" >> "$PKG_FILE" ;; + go.mod) + printf '%s\n' "$path" >> "$GOMOD_FILE" ;; + esac + case "$bn" in + *.js|*.cjs|*.mjs) + # Cap at 2MB so a hostile/large JS file can't stall sha256sum on the + # whole org sweep; bundlers rarely emit >2MB single files. + if [ "$(stat -f%z "$path" 2>/dev/null || stat -c%s "$path" 2>/dev/null || echo 0)" -le 2097152 ]; then + printf '%s\n' "$path" >> "$JS_FILE" + fi + ;; + esac + # GitHub Actions only executes workflows under $REPO_ROOT/.github/workflows/. 
+ # Anchor on $WORK/.github/workflows/ so nested .github copies (vendored + # examples, test fixtures, monorepo subpackage scaffolding) cannot produce + # false `persistence_workflow` (rule 3) or `go_unsafe_env` / + # `go_unsafe_env_indirect` (rule 6) hits — those rules consume $WF_FILE / + # $PERSIST_FILE populated here. See PRRT_kwDOQ91i5M6EVwmg / cubic#258. + # + # The persistence_workflow basename match is INTENTIONALLY narrowed to the + # exact filenames known to be dropped by the Shai-Hulud worm + # (`shai-hulud.yml`, `shai-hulud.yaml`, `shai-hulud-workflow.yml`, + # `shai-hulud-workflow.yaml`). A broader `shai-hulud*` glob would self- + # detect the legitimate defense workflow this script is invoked from + # (`.github/workflows/shai-hulud-sweep.yml` in this very repo) on every + # daily sweep, producing a guaranteed false IOC page that conditions + # responders to mute the channel — textbook alert fatigue. Keep this glob + # explicit; if a new worm variant ships a new filename, add it here + # rather than reverting to a wildcard. + case "$path" in + "$WORK/.github/workflows/"*) + case "$bn" in + *.yml|*.yaml) + printf '%s\n' "$path" >> "$WF_FILE" + case "$bn" in + shai-hulud.yml|shai-hulud.yaml|shai-hulud-workflow.yml|shai-hulud-workflow.yaml) + printf '%s\n' "$path" >> "$PERSIST_FILE" ;; + esac + ;; + esac + ;; + esac + done < <(find "$WORK" -type f "${EXCLUDE_FIND[@]}" 2>/dev/null) + + # 1. Lockfile IOC scan (npm/pip/Go), per-ecosystem matchers. + # The previous `grep -qF "$nameversion"` against every lockfile silently + # missed pip/poetry/Pipfile/go.sum and modern package-lock.json formats — + # the daily-IOC workflow trusted a `Clean.` summary that was structurally + # incapable of finding hits in those ecosystems. 
+ while IFS= read -r lockpath; do + lockbn="${lockpath##*/}" + eco="$(lockfile_eco "$lockbn")" + [ "$eco" = "unknown" ] && continue + needles="$OUTPUT_DIR/needles.$eco" + [ -s "$needles" ] || continue + # Structured pass for npm package-lock.json. Modern v2/v3 lockfiles + # encode dependency identity in `.packages["node_modules/"].version` + # rather than embedding `name@version` or `name-version.tgz` substrings. + # Lockfiles produced without `resolved` URLs (workspaces, + # private-registry overrides, npm omit-resolved configs) carry NO + # matchable substring at all, so the grep-only path below silently + # false-cleans even when a compromised IOC version is installed — + # exactly the signal the daily-IOC workflow trusts. Project + # name@version structurally via jq and exact-line match against the + # npm needles. The v1 fallback walks the recursive `.dependencies` + # tree (older lockfiles also lack `resolved` URLs in some configs). + # See PRRT_kwDOQ91i5M6EcQEH / cubic#159. + if [ "$lockbn" = "package-lock.json" ]; then + while IFS= read -r needle; do + finding "$repo" "ioc_package_match" "${lockpath#"$WORK"/}" "npm:$needle" + done < <( + { + jq -r ' + (.packages // {}) + | to_entries[]? + | select(.key != "" and (.value.version // "") != "") + | (.key | sub("^.*node_modules/"; "")) + "@" + .value.version + ' "$lockpath" 2>/dev/null || true + jq -r ' + def walk: to_entries[]? | (.key + "@" + (.value.version // "")), (.value.dependencies // {} | walk); + (.dependencies // {}) | walk + ' "$lockpath" 2>/dev/null || true + } | grep -Fxf "$needles" 2>/dev/null | sort -u + ) + fi + # Substring grep — covers yarn.lock / pnpm-lock.yaml / bun.lock / + # pip / Go lockfiles plus npm lockfiles that DO embed + # `name-version.tgz` in resolved URLs. Runs on package-lock.json too + # as a defense-in-depth signal alongside the structured pass above. 
+ while IFS= read -r needle; do + finding "$repo" "ioc_package_match" "${lockpath#"$WORK"/}" "$eco:$needle" + done < <(grep -F -f "$needles" "$lockpath" 2>/dev/null | sort -u) + done < "$LOCKS_FILE" + + # 2. JS hash scan. Hash IOCs are content-keyed by nature, so filename- + # pinning to `bundle.js` would let any worm variant trivially bypass the + # layer by renaming the dropper. Scan every .js/.cjs/.mjs file under 2MB + # (collected above with exclusions) and match against the hash IOC list. + while IFS= read -r path; do + hash="$($SHA256_CMD "$path" | awk '{print $1}')" + grep -qFx "$hash" "$OUTPUT_DIR/hashes.sorted" \ + && finding "$repo" "ioc_bundle_hash" "${path#"$WORK"/}" "$hash" + done < "$JS_FILE" + + # 3. Persistence workflow file — only the repo-root .github/workflows/ is + # executed by GitHub Actions, so $PERSIST_FILE is already scoped to that + # path by the case statement above. Nested .github/workflows directories + # (vendored examples, test fixtures) would only produce false IOC hits. + while IFS= read -r f; do + finding "$repo" "persistence_workflow" "${f#"$WORK"/}" "shai-hulud workflow file present" + done < "$PERSIST_FILE" + + # 4. Suspicious lifecycle script patterns (npm). Broader regex covers + # `node ./bundle.js`, `node dist/bundle.js`, npx, `eval $(curl ...)`, + # `base64 --decode` / `-D`, and `| bash` pipes — the narrow original + # regex missed common Shai-Hulud variant patterns. + # The per-script results are collected into an array and folded with + # `any`: `jq -e` derives its exit status from the LAST output only, so a + # benign lifecycle script listed after a malicious one would otherwise + # mask the hit. `strings` drops non-string script values, which would + # make `test` raise and abort the whole filter. + while IFS= read -r pkgjson; do + jq -e '[.scripts // {} | to_entries[] | select(.key | test("install|postinstall|preinstall")) | .value | strings | test("node[[:space:]]+\\S*bundle\\.js|curl[[:space:]].*\\|[[:space:]]?(ba)?sh|wget[[:space:]].*\\|[[:space:]]?(ba)?sh|base64[[:space:]]+(-d|--decode|-D)|eval[[:space:]]+\\$\\(|npx[[:space:]]+.*bundle")] | any' \ + "$pkgjson" >/dev/null 2>&1 \ + && finding "$repo" "suspicious_lifecycle_script" "${pkgjson#"$WORK"/}" "matches Shai-Hulud postinstall pattern" + done < "$PKG_FILE" + + # 5. 
+  # Go: replace directives pointing outside trusted hosts, AND a parallel
+  # path-equality rule that catches the Scenario C in-org compromise where
+  # an attacker swaps `replace github.com/allora/legit => github.com/allora/
+  # attacker-fork` (which the host allowlist alone passes through).
+  while IFS= read -r gomod; do
+    # Read awk output via process substitution so the inner loop, and every
+    # finding() call in it, runs in the parent shell. In a pipeline the loop
+    # would run in a subshell and any shell state finding() updates would
+    # be lost.
+    # NOTE(review): the end-of-repo cleanup reads $DIRTY_REPOS_FILE; confirm
+    # what state finding() actually keeps.
+    # Each awk record is: LHS module path, RHS target, original line,
+    # separated by tabs.
+    while IFS=$'\t' read -r lhs rhs line; do
+      case "$rhs" in
+        ./*|../*)
+          finding "$repo" "go_local_replace" "${gomod#"$WORK"/}" "$line"
+          continue
+          ;;
+        /*)
+          # Absolute-path replace can resolve outside the checked-out tree
+          # on writable runners — REFERENCE.md flags this as a real
+          # hardening-gate bypass, treat as IOC-grade.
+          finding "$repo" "go_suspicious_replace" "${gomod#"$WORK"/}" "absolute-path replace: $line"
+          continue
+          ;;
+      esac
+      # Host-allowlist gate.
+      if ! printf '%s\n' "$rhs" | grep -qE "^($GO_TRUSTED_HOSTS)(/|$)"; then
+        finding "$repo" "go_suspicious_replace" "${gomod#"$WORK"/}" "$line"
+        continue
+      fi
+      # Path-equality gate (defends Scenario C in-org compromise where
+      # both LHS and RHS sit under an allowlisted host). The top-level
+      # 3-segment path of LHS and RHS must match unless explicitly
+      # allowlisted via GO_REPLACE_ALLOWED_FILE. Paths with fewer than
+      # three segments are compared whole.
+      lhs_top="$(printf '%s\n' "$lhs" | awk -F/ 'NF>=3{print $1"/"$2"/"$3} NF<3{print $0}')"
+      rhs_top="$(printf '%s\n' "$rhs" | awk -F/ 'NF>=3{print $1"/"$2"/"$3} NF<3{print $0}')"
+      if [ "$lhs_top" != "$rhs_top" ]; then
+        if ! grep -qFx "$(printf '%s\t%s' "$lhs_top" "$rhs_top")" "$GO_REPLACE_ALLOW_NORM" 2>/dev/null; then
+          finding "$repo" "go_replace_path_mismatch" "${gomod#"$WORK"/}" "top-level path mismatch: $line"
+        fi
+      fi
+    done < <(awk '
+      # Track `replace ( ... )` blocks so their inner lines are parsed too.
+      /^[[:space:]]*replace[[:space:]]*\(/ { inblock=1; next }
+      inblock && /^[[:space:]]*\)/ { inblock=0; next }
+      /^[[:space:]]*replace[[:space:]]/ || inblock {
+        n = index($0, "=>")
+        if (n == 0) next
+        lhs_raw = substr($0, 1, n - 1)
+        rhs_raw = substr($0, n + 2)
+        # Drop the `replace` keyword, surrounding blanks and any version.
+        sub(/^[[:space:]]*replace[[:space:]]+/, "", lhs_raw)
+        sub(/^[[:space:]]+/, "", lhs_raw); sub(/[[:space:]]+$/, "", lhs_raw)
+        sub(/[[:space:]]+v[0-9].*$/, "", lhs_raw)
+        sub(/^[[:space:]]+/, "", rhs_raw); sub(/[[:space:]]+v[0-9].*$/, "", rhs_raw)
+        sub(/[[:space:]]+$/, "", rhs_raw)
+        if (rhs_raw == "") next
+        # Tabs are the field separator of the emitted record, so flatten
+        # any tab in the echoed source line.
+        gsub(/\t/, " ", $0)
+        printf "%s\t%s\t%s\n", lhs_raw, rhs_raw, $0
+      }
+    ' "$gomod")
+  done < "$GOMOD_FILE"
+
+  # 6. Go: dangerous env settings in workflows. The literal grep runs on the
+  # workflow text with comments stripped, so a documentation comment like
+  # `# Forbid GOSUMDB=off in CI` does not trigger a false `go_unsafe_env`
+  # finding. A `#` only counts as a comment at line start or after
+  # whitespace (the YAML and shell rule). Truncating at every `#` would let
+  # `X="a#b" GOSUMDB=off go build` erase the very setting this check looks
+  # for. A second pass, over the raw file, looks for indirect references via
+  # `vars`/`secrets`/`env`/`inputs`/`matrix` contexts that would otherwise
+  # slip past a literal-only grep.
+  while IFS= read -r wf; do
+    if sed -E 's/(^|[[:space:]])#.*$//' "$wf" \
+      | grep -qE 'GONOSUMCHECK|GOSUMDB[=:][[:space:]]*["'\'']?off|GOINSECURE[=:]|GOFLAGS[=:].*-insecure'; then
+      finding "$repo" "go_unsafe_env" "${wf#"$WORK"/}" "GOSUMDB/sumcheck/insecure-fetch bypassed in CI"
+    fi
+    if grep -qE '\$\{\{[[:space:]]*(vars|secrets|env|inputs|matrix)\.(GOSUMDB|GOINSECURE|GOFLAGS|GONOSUMCHECK)' "$wf"; then
+      finding "$repo" "go_unsafe_env_indirect" "${wf#"$WORK"/}" "indirect Go env reference — review for runtime bypass"
+    fi
+  done < "$WF_FILE"
+
+  # End-of-repo cleanup — preserve the working tree for forensic inspection
+  # whenever this repo emitted any IOC-grade finding.
+  # REFERENCE.md §Incident response requires the matched file to be
+  # inspectable to confirm the multi-IOC gate; deleting the clone forces a
+  # re-clone (point-in-time evidence may have moved). Clean repos are still
+  # removed to bound disk.
+  if grep -qFx "$repo" "$DIRTY_REPOS_FILE" 2>/dev/null; then
+    mkdir -p "$EVIDENCE_DIR/$(dirname "$repo")"
+    # Still best-effort (the sweep continues), but a failed move is logged:
+    # the summary later tells responders that evidence was preserved.
+    if ! mv "$WORK" "$EVIDENCE_DIR/$repo" 2>/dev/null; then
+      log "WARN: could not move $WORK to $EVIDENCE_DIR/$repo — forensic evidence for $repo was not preserved there"
+    fi
+  else
+    rm -rf "$WORK"
+  fi
+done < "$OUTPUT_DIR/repos.txt"
+
+# Drop the per-repo scratch index files now that aggregation is done.
+rm -rf "$OUTPUT_DIR/.scan" 2>/dev/null || true
+
+# 7. Exfil repo search
+log "Searching for public Shai-Hulud exfil repos..."
+# Wrap the OR group in parentheses — without them GitHub search parses
+# `OR` as a top-level Boolean operator and the second branch becomes
+# unscoped (`shai_hulud` matches any repo on GitHub). Without parens, any
+# external attacker can create a `shai_hulud-*` repo to poison every org's
+# sweep with a false `public_exfil_repo` for their own org.
+# A failed search is recorded as an operational `check_skipped` finding,
+# carrying the first line of gh's stderr, rather than passing as clean.
+EXFIL_OUT="$OUTPUT_DIR/.exfil.out"
+EXFIL_ERR="$OUTPUT_DIR/.exfil.err"
+if gh api -X GET search/repositories -f q="org:$ORG (shai-hulud OR shai_hulud)" \
+  --jq '.items[]?.full_name' > "$EXFIL_OUT" 2> "$EXFIL_ERR"; then
+  while IFS= read -r exfil; do
+    [ -n "$exfil" ] && finding "$exfil" "public_exfil_repo" "" "matches ^[Ss]hai-[Hh]ulud naming"
+  done < "$EXFIL_OUT"
+else
+  finding "$ORG" "check_skipped" "" "org-scoped exfil search failed (rate limit or auth): $(head -1 "$EXFIL_ERR" 2>/dev/null)"
+fi
+
+# Member-side exfil search requires `read:org`. Probe once; if scope is
+# missing we emit a `check_skipped` operational finding instead of silently
+# producing a false-clean for a compromised-member scenario. Rate-limit:
+# GitHub authenticated search is 30 req/min, so insert a small pause per
+# member to stay inside the budget.
+# Enumerate org members, then run one repository search per member. Each
+# failure is surfaced as a `check_skipped` operational finding carrying the
+# first line of gh's stderr.
+# NOTE(review): the org-scoped query also covers `shai_hulud`; confirm
+# whether the per-member query below should include that variant too.
+MEMBERS_OUT="$OUTPUT_DIR/.members.out"
+MEMBERS_ERR="$OUTPUT_DIR/.members.err"
+if gh api orgs/"$ORG"/members --paginate --jq '.[].login' \
+  > "$MEMBERS_OUT" 2> "$MEMBERS_ERR"; then
+  while IFS= read -r member; do
+    [ -n "$member" ] || continue
+    MEMBER_OUT="$OUTPUT_DIR/.member.$member.out"
+    MEMBER_ERR="$OUTPUT_DIR/.member.$member.err"
+    if gh api -X GET "search/repositories" -f q="user:$member shai-hulud" \
+      --jq '.items[]?.full_name' > "$MEMBER_OUT" 2> "$MEMBER_ERR"; then
+      while IFS= read -r hit; do
+        [ -n "$hit" ] && finding "$hit" "public_exfil_repo_member" "" "shai-hulud-named repo under org member"
+      done < "$MEMBER_OUT"
+    else
+      finding "$member" "check_skipped" "" "per-member exfil search failed (likely search rate limit): $(head -1 "$MEMBER_ERR" 2>/dev/null)"
+    fi
+    rm -f "$MEMBER_OUT" "$MEMBER_ERR"
+    # Pause between members to stay inside the search rate budget.
+    sleep 2
+  done < "$MEMBERS_OUT"
+else
+  finding "$ORG" "check_skipped" "" "orgs/$ORG/members enumeration failed (token likely lacks read:org scope): $(head -1 "$MEMBERS_ERR" 2>/dev/null)"
+fi
+rm -f "$EXFIL_OUT" "$EXFIL_ERR" "$MEMBERS_OUT" "$MEMBERS_ERR"
+
+# Aggregate NDJSON → JSON array.
+jq -s '.' "$FINDINGS_NDJSON" > "$FINDINGS"
+
+# Split IOC findings from operational findings so a single transient
+# clone_failed / check_skipped / go_local_replace does not trigger
+# `INVOKE INCIDENT RESPONSE`. A finding is IOC-grade when its rule matches
+# $IOC_RULES_RE; everything else is operational.
+# Exit codes:
+#   0 — clean (no findings at all)
+#   1 — IOC findings present (incident response)
+#   2 — operational issues only (review, but not an incident)
+IOC_COUNT=$(jq --arg re "$IOC_RULES_RE" '[.[] | select(.rule | test($re))] | length' "$FINDINGS")
+OP_COUNT=$(jq --arg re "$IOC_RULES_RE" '[.[] | select(.rule | test($re) | not)] | length' "$FINDINGS")
+TOTAL_COUNT=$(jq 'length' "$FINDINGS")
+
+# Render the Markdown summary: header counts, then the IOC section, the
+# operational section, or the clean notice, depending on the counts.
+{
+  echo "# Shai-Hulud IOC sweep — $ORG"
+  echo
+  echo "**Run:** $(date -u)"
+  echo "**Repos scanned:** $REPO_COUNT"
+  echo "**IOC findings:** $IOC_COUNT"
+  echo "**Operational findings:** $OP_COUNT"
+  echo "**Total findings:** $TOTAL_COUNT"
+  echo
+  if [ "$IOC_COUNT" -gt 0 ]; then
+    echo "## IOC findings"
+    jq -r --arg re "$IOC_RULES_RE" '.[] | select(.rule | test($re)) | "- **\(.repo)** [\(.rule)] \(.path) — \(.detail)"' "$FINDINGS"
+    echo
+    echo "**INVOKE INCIDENT RESPONSE IMMEDIATELY.** See REFERENCE.md §Incident response."
+    # The cleanup step moves each dirty clone to $EVIDENCE_DIR/$repo, so
+    # spell the path out with placeholders instead of a bare `//`.
+    echo "Forensic evidence preserved under: \`$EVIDENCE_DIR/<org>/<repo>\`."
+  fi
+  if [ "$OP_COUNT" -gt 0 ]; then
+    echo
+    echo "## Operational findings (review, not an incident)"
+    jq -r --arg re "$IOC_RULES_RE" '.[] | select(.rule | test($re) | not) | "- **\(.repo)** [\(.rule)] \(.path) — \(.detail)"' "$FINDINGS"
+  fi
+  if [ "$TOTAL_COUNT" -eq 0 ]; then
+    echo "Clean. No IOCs matched the current seed lists."
+  fi
+} > "$SUMMARY"
+
+log "Done. Summary: $SUMMARY"
+if [ "$IOC_COUNT" -gt 0 ]; then exit 1
+elif [ "$OP_COUNT" -gt 0 ]; then exit 2
+else exit 0
+fi
diff --git a/scripts/shai-hulud-ioc-sweep.sh.sha256 b/scripts/shai-hulud-ioc-sweep.sh.sha256
new file mode 100644
index 0000000..53e1ea2
--- /dev/null
+++ b/scripts/shai-hulud-ioc-sweep.sh.sha256
@@ -0,0 +1 @@
+ca903236e2c47fc9893494daa13ca8f6da1b5597cb6ea3f2361b8bc824114dbb  scripts/shai-hulud-ioc-sweep.sh