From d9aea1afde86e48d85442b38cb3dc6de75a37a13 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Fri, 5 Jun 2026 21:55:02 +1000 Subject: [PATCH] fix(scan): gate on the unsuppressed population by default (secure --fail-on) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close a HIGH-severity CI-gate bypass. `wardline scan --fail-on` applied repository-controlled suppressions (`.wardline/baseline.yaml`, `wardline.yaml` waivers, `.wardline/judged.yaml`) to findings BEFORE evaluating the gate, so a malicious PR could commit a suppression keyed to its own new defect's fingerprint and clear the gate. All three sources are committed repo content and equally exploitable. Reproduced live (baselining the sole ERROR zeroed the gate). Secure-by-default model (combines #24 + #25): - `gate_decision` now evaluates a separate UNSUPPRESSED population (`ScanResult.gate_findings`). baseline/waiver/judged still ANNOTATE the emitted findings (`suppressed=…` stays visible) but no longer clear the gate. - The gate population is built with apply_suppressions over EMPTY baseline + waivers + judged, NOT `list(raw)`, so the lineless-DEFECT→non-gating-FACT downgrade is preserved (no spurious gate trips). - `--new-since ` (operator-supplied, unforgeable) scopes BOTH the emitted and gate populations — the secure CI ratchet. - `--trust-suppressions` (CLI) / `trust_suppressions` (run_scan, MCP scan tool), default False, restores the local ratchet / judge DX for trusted checkouts (None sentinel → gate falls back to the suppressed findings). `run_judge` passes True so judge/triage/persist are unchanged. - `load_judged` now requires `verdict: FALSE_POSITIVE` (rejects a hand-edited TRUE_POSITIVE / missing verdict smuggled in as a silent suppression). BREAKING (noted in CHANGELOG, acceptable at 0.x): baseline-gated CI goes green→red on upgrade until `--new-since` or `--trust-suppressions` is added. Docs updated (suppression.md): the secure CI ratchet is `--new-since`. Combines and supersedes #24 (judged-only) and #25 (no escape hatch + a lineless-DEFECT gate bug). Full suite green (2394 passed), ruff + mypy clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 26 +++ docs/guides/suppression.md | 47 ++++- src/wardline/cli/scan.py | 14 ++ src/wardline/core/judge_run.py | 4 + src/wardline/core/judged.py | 9 + src/wardline/core/run.py | 79 ++++++-- src/wardline/mcp/server.py | 14 +- tests/unit/cli/test_cli.py | 53 +++++- tests/unit/core/test_judge_run.py | 18 ++ tests/unit/core/test_judged.py | 46 ++++- tests/unit/core/test_run.py | 221 +++++++++++++++++++++- tests/unit/mcp/test_server_suppression.py | 23 +++ 12 files changed, 518 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3e21383..94cebf2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -86,6 +86,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Wardline-private bit would break fact reconciliation entirely. Closing it fully needs a Clarion read-path contract change; the keying site carries an explicit comment. This path is opt-in and not the scan gate, so impact is lower. +- **The `--fail-on` gate no longer honours repository-controlled suppressions by + default (closes a CI-gate bypass).** `.wardline/baseline.yaml`, `wardline.yaml` + waivers, and `.wardline/judged.yaml` are all committed repository content, so a + malicious pull request could add a suppression entry keyed to its own new defect's + fingerprint and clear the gate. The gate now evaluates the **unsuppressed** + population by default; baseline / waiver / judged still **annotate** the emitted + findings (`suppressed: baselined | waived | judged`) but cannot clear the gate. The + secure CI ratchet is the operator-supplied, unforgeable `--new-since `, + which scopes **both** the emitted findings and the gate. A new `--trust-suppressions` + flag (CLI) / `trust_suppressions` arg (MCP `scan`), default false, restores the old + post-suppression gate for **trusted local checkouts** (and is what the `judge` + workflow uses internally). `.wardline/judged.yaml` records now also **require** + `verdict: FALSE_POSITIVE` on load — a missing or non-FP verdict is rejected, so a + hand-edited judged entry cannot be smuggled in as a silent suppression + (`build_judged_document` always emits it, so machine round-trips stay valid). New + `ScanResult.gate_findings` field carries the unsuppressed gate population (None + sentinel = trust suppressions / fall back to `findings`). + + > **BREAKING (acceptable at 0.x):** a CI job that relies on a committed baseline + > (or waiver / judged file) to keep `wardline scan --fail-on=…` green will now go + > **red** on upgrade, because the baselined defects re-enter the gate population. Add + > `--new-since ` (recommended for CI) or `--trust-suppressions` (trusted + > checkouts only) to restore a passing gate. Note: legis's scan artifact and the + > "one judge / reproduces Wardline's gate population exactly" property are derived + > from the annotated `findings`, so they continue to reflect the suppressed view; + > only the local `--fail-on` exit code changed. - **Dangerous-sink rules now see lambda bodies (closes a false-green).** `_own_calls` treated `ast.Lambda` as a separate scope and only inspected lambda *default* expressions, so a sink reached inside a lambda *body* — `cb = lambda: eval(src)`, diff --git a/docs/guides/suppression.md b/docs/guides/suppression.md index 6ee7d45f..7c3946a9 100644 --- a/docs/guides/suppression.md +++ b/docs/guides/suppression.md @@ -21,11 +21,45 @@ $ wardline scan . scanned 2 file(s); 4 finding(s) — 1 suppressed (1 baseline / 0 waiver / 0 judged), 0 new -> findings.jsonl ``` +## Suppressions and the `--fail-on` gate (read this first) + +All three layers — baseline, waiver, judged — live in **committed repository +content** (`.wardline/baseline.yaml`, `wardline.yaml`, `.wardline/judged.yaml`). +That makes them attacker-controllable in an untrusted pull request: a PR can add a +suppression entry keyed to its own new defect's fingerprint. + +So, **by default the `--fail-on` gate evaluates the *unsuppressed* population.** +Baseline / waiver / judged still **annotate** every emitted finding (you see +`suppressed: baselined | waived | judged` in the output) — they just do **not** +clear the gate. A self-suppressing PR therefore still goes red. + +Two ways to scope or relax the gate, depending on trust: + +- **`--new-since ` — the secure CI ratchet.** The git ref is supplied + by the operator (the pipeline), not by repository content, so it is unforgeable. + It scopes **both** the emitted findings and the gate to findings new since the + ref: a pre-existing defect outside the delta does not trip; a new one inside it + does, and no committed suppression can clear it. This is the recommended adopt-an- + existing-codebase pattern in CI. +- **`--trust-suppressions` — trusted local checkouts only.** Restores the old + behaviour: baseline / waiver / judged clear the gate. Use it when you are running + Wardline on a checkout you trust (your own working tree, the judge DX loop). **Do + not** enable it in CI on untrusted PR content. + +The MCP `scan` tool mirrors this exactly: `new_since` and a `trust_suppressions` +boolean (default false). + ## Baseline A baseline is a git-committable snapshot of findings you accept as-is. It is the -fast on-ramp for an existing project: capture everything once, then let the -`--fail-on` gate fire only on findings that appear *after* the snapshot. +fast on-ramp for an existing project: capture everything once so they are +annotated as `baselined` in scan output. + +Note (changed): a baseline **annotates** but no longer clears the `--fail-on` +gate by default — see [Suppressions and the `--fail-on` gate](#suppressions-and-the-fail-on-gate-read-this-first) +above. To make the gate "fire only on findings that appear after the snapshot", +use the unforgeable `--new-since ` ratchet in CI, or +`--trust-suppressions` on a trusted local checkout. ``` wardline baseline [OPTIONS] COMMAND [ARGS]... @@ -136,8 +170,13 @@ findings: Commit `.wardline/judged.yaml` like the baseline. A judged suppression is advisory — the rationale is recorded precisely so a human can audit it and revert -by deleting the entry. See the [LLM triage judge](judge.md) guide for how -verdicts are produced and the `--write` confidence floor. +by deleting the entry. Like the other layers it **annotates** but does not clear +the `--fail-on` gate by default (see [the gate section](#suppressions-and-the-fail-on-gate-read-this-first)); +the `judge` workflow itself always consults judged records. Each record must carry +`verdict: FALSE_POSITIVE` — a record without it, or with any other verdict, is +rejected on load so a hand-edited entry cannot become a silent suppression. See the +[LLM triage judge](judge.md) guide for how verdicts are produced and the `--write` +confidence floor. ## A note on line sensitivity diff --git a/src/wardline/cli/scan.py b/src/wardline/cli/scan.py index 2587b189..c0525803 100644 --- a/src/wardline/cli/scan.py +++ b/src/wardline/cli/scan.py @@ -102,6 +102,17 @@ default=False, help="Allow wardline.yaml source_roots to resolve outside PATH.", ) +@click.option( + "--trust-suppressions", + is_flag=True, + default=False, + help=( + "Let repository-controlled baseline/waiver/judged files clear the --fail-on gate " + "(they always annotate findings regardless). Use ONLY for trusted local checkouts; " + "in CI prefer the unforgeable --new-since ratchet. Default off: by " + "default the gate evaluates the unsuppressed population so a PR cannot self-suppress." + ), +) def scan( path: Path, config_path: Path | None, @@ -119,6 +130,7 @@ def scan( yes: bool, strict_defaults: bool, allow_source_root_escape: bool, + trust_suppressions: bool, ) -> None: """Scan PATH for findings.""" if fmt == "sarif": @@ -158,6 +170,7 @@ def scan( trusted_packs=trusted_packs, strict_defaults=strict_defaults, confine_to_root=not allow_source_root_escape, + trust_suppressions=trust_suppressions, ) findings = result.findings if fix: @@ -193,6 +206,7 @@ def confirm_cb(rel_path: str, orig: str, replacement: str, f: Finding) -> bool: trusted_packs=trusted_packs, strict_defaults=strict_defaults, confine_to_root=not allow_source_root_escape, + trust_suppressions=trust_suppressions, ) findings = result.findings if fmt == "sarif": diff --git a/src/wardline/core/judge_run.py b/src/wardline/core/judge_run.py index bce78b56..13f755c5 100644 --- a/src/wardline/core/judge_run.py +++ b/src/wardline/core/judge_run.py @@ -179,6 +179,10 @@ def _default_caller(req: JudgeRequest) -> JudgeResponse: trust_local_packs=trust_local_packs, trusted_packs=trusted_packs, strict_defaults=strict_defaults, + # The judge flow is the trusted local path: it consults judged records. The + # emitted ``findings`` are always judged-annotated regardless of this flag; + # passing True keeps the gate (if any consumer reads it) on the trusted set too. + trust_suppressions=True, ) judged_set = load_judged(root / ".wardline" / "judged.yaml") diff --git a/src/wardline/core/judged.py b/src/wardline/core/judged.py index 7dfcabce..06873371 100644 --- a/src/wardline/core/judged.py +++ b/src/wardline/core/judged.py @@ -110,6 +110,15 @@ def load_judged(path: Path) -> JudgedSet: if fp in seen: raise ConfigError(f"{path.name} findings[{idx}]: duplicate fingerprint {fp!r}") seen.add(fp) + # A judged record suppresses a finding ONLY as a FALSE_POSITIVE verdict. Require + # the field and reject any other value so a hand-edited TRUE_POSITIVE (or a + # missing verdict) cannot be smuggled in as a silent suppression. write_judged + # always emits verdict: FALSE_POSITIVE, so machine round-trips stay valid. + verdict = _require_str(e, "verdict", idx, path.name) + if verdict != "FALSE_POSITIVE": + raise ConfigError( + f"{path.name} findings[{idx}].verdict must be FALSE_POSITIVE, got {verdict!r}" + ) rationale = _require_str(e, "rationale", idx, path.name) # Provenance is the audit primitive — never default it. A judged record with # no attributable model / policy / confidence is an unauditable suppression. diff --git a/src/wardline/core/run.py b/src/wardline/core/run.py index 1dceb69a..a1100222 100644 --- a/src/wardline/core/run.py +++ b/src/wardline/core/run.py @@ -15,7 +15,7 @@ from typing import TYPE_CHECKING from wardline.core import config as config_mod -from wardline.core.baseline import load_baseline +from wardline.core.baseline import Baseline, load_baseline from wardline.core.delta import get_affected_entities, get_changed_files_since from wardline.core.discovery import discover, missing_source_roots from wardline.core.errors import ConfigError @@ -45,7 +45,8 @@ def _fp(*parts: str) -> str: @dataclass(frozen=True, slots=True) class ScanSummary: total: int # every finding (defects + facts/metrics) - active: int # non-suppressed DEFECTs — the gate population + active: int # non-suppressed DEFECTs in the emitted findings (NOT the gate population — + # the gate evaluates ScanResult.gate_findings unless --trust-suppressions) baselined: int waived: int judged: int @@ -66,6 +67,14 @@ class ScanResult: # this exact run instead of re-deriving. Never serialised over MCP. context: AnalysisContext | None scanned_paths: tuple[str, ...] = () + # The UNSUPPRESSED gate population (None SENTINEL — never a falsy-empty fallback). + # Repository-controlled baseline/waiver/judged still ANNOTATE ``findings`` (visible + # as ``suppressed=…``), but a malicious PR must not be able to clear the ``--fail-on`` + # gate by committing a suppression keyed to its own new defect. ``gate_decision`` + # evaluates this when it is not None, else falls back to ``findings`` (the trusted, + # local ``--trust-suppressions`` / directly-constructed-ScanResult behaviour). It is + # scoped by ``--new-since`` identically to ``findings``. + gate_findings: list[Finding] | None = None @dataclass(frozen=True, slots=True) @@ -85,6 +94,7 @@ def run_scan( trust_local_packs: bool = False, trusted_packs: tuple[str, ...] = (), strict_defaults: bool = False, + trust_suppressions: bool = False, ) -> ScanResult: """Discover → analyze → apply suppressions. Pure function of (disk + config). @@ -94,6 +104,16 @@ def run_scan( ``confine_to_root`` (default True) makes ``discover`` reject any ``source_root`` that resolves outside ``root``. Callers that intentionally scan outside the project root must opt out explicitly. + + ``trust_suppressions`` (default False) is the SECURITY default. When False the + ``--fail-on`` gate evaluates a separately-built UNSUPPRESSED population + (``ScanResult.gate_findings``): repository-controlled baseline/waiver/judged + files still annotate the emitted ``findings`` but cannot clear the gate, so a + malicious PR cannot self-suppress its own new defect. When True the gate falls + back to the suppressed ``findings`` (``gate_findings`` is set to None) — the + trusted local / judge-DX behaviour, an explicit operator trust decision suitable + only for a trusted checkout, never for enforcement on untrusted PR content. The + secure CI ratchet is the operator-supplied, unforgeable ``--new-since`` instead. """ from wardline.scanner.analyzer import build_analyzer from wardline.scanner.grammar import TrustGrammar, default_grammar @@ -185,7 +205,21 @@ def run_scan( baseline = load_baseline(root / ".wardline" / "baseline.yaml") waivers = WaiverSet(parse_waivers(cfg.waivers)) judged = load_judged(root / ".wardline" / "judged.yaml") - findings = apply_suppressions(raw, baseline, waivers, today=date.today(), judged=judged) + today = date.today() + # The emitted findings ALWAYS carry the full suppression annotations (baseline, + # waiver, judged) so ``suppressed=…`` is visible in output regardless of trust. + findings = apply_suppressions(raw, baseline, waivers, today=today, judged=judged) + # The gate population applies ZERO suppression but runs the SAME structural + # transforms apply_suppressions does (esp. the lineless-DEFECT→non-gating-FACT + # downgrade), so the only difference vs ``findings`` is the suppression sources — + # NOT ``list(raw)``, which would let a lineless DEFECT trip the gate. When the + # operator trusts repo suppressions, gate_findings is None and the gate falls back + # to the suppressed ``findings`` (None SENTINEL, never an accidental falsy-empty). + gate_findings: list[Finding] | None + if trust_suppressions: + gate_findings = None + else: + gate_findings = apply_suppressions(raw, Baseline(frozenset()), WaiverSet([]), today=today, judged=None) if new_since is not None: changed_files = get_changed_files_since(new_since, root) @@ -195,18 +229,26 @@ def run_scan( else: affected = set() - new_findings = [] - for f in findings: - if f.kind is Kind.DEFECT and f.suppressed is SuppressionState.ACTIVE: - is_new = (f.location.path in changed_files) or (f.qualname is not None and f.qualname in affected) - if not is_new: - f = replace( - f, - suppressed=SuppressionState.BASELINED, - suppression_reason=f"delta: unchanged since {new_since}", - ) - new_findings.append(f) - findings = new_findings + def apply_delta_scope(candidates: list[Finding]) -> list[Finding]: + # Suppress any ACTIVE defect outside the delta so the gate only fires on + # findings new since ``new_since``. Applied to BOTH emitted and gate + # populations so the operator-supplied (unforgeable) ratchet scopes the gate. + scoped: list[Finding] = [] + for f in candidates: + if f.kind is Kind.DEFECT and f.suppressed is SuppressionState.ACTIVE: + is_new = (f.location.path in changed_files) or (f.qualname is not None and f.qualname in affected) + if not is_new: + f = replace( + f, + suppressed=SuppressionState.BASELINED, + suppression_reason=f"delta: unchanged since {new_since}", + ) + scoped.append(f) + return scoped + + findings = apply_delta_scope(findings) + if gate_findings is not None: + gate_findings = apply_delta_scope(gate_findings) defects = [f for f in findings if f.kind is Kind.DEFECT] summary = ScanSummary( @@ -227,6 +269,7 @@ def run_scan( path.relative_to(resolved_root).as_posix() if path.is_relative_to(resolved_root) else path.as_posix() for path in files ), + gate_findings=gate_findings, ) @@ -234,5 +277,9 @@ def gate_decision(result: ScanResult, fail_on: Severity | None) -> GateDecision: """Translate a scan into a pass/fail verdict. A trip is data, not an error.""" if fail_on is None: return GateDecision(tripped=False, fail_on=None, exit_class=0) - tripped = gate_trips(result.findings, fail_on) + # None SENTINEL: evaluate the unsuppressed gate population when present (secure + # default), else the suppressed ``findings`` (trusted ``--trust-suppressions`` / + # a directly-constructed ScanResult with no gate_findings). + gate_population = result.gate_findings if result.gate_findings is not None else result.findings + tripped = gate_trips(gate_population, fail_on) return GateDecision(tripped=tripped, fail_on=fail_on.value, exit_class=1 if tripped else 0) diff --git a/src/wardline/mcp/server.py b/src/wardline/mcp/server.py index 5732c774..44668653 100644 --- a/src/wardline/mcp/server.py +++ b/src/wardline/mcp/server.py @@ -189,6 +189,7 @@ def _scan( new_since = args.get("new_since") trusted_packs = _trusted_packs_arg(args) cache_dir = _cache_dir_arg(args, root) + trust_suppressions = bool(args.get("trust_suppressions") or False) result = run_scan( path, config_path=_cfg(args, root), @@ -198,6 +199,7 @@ def _scan( trust_local_packs=trust_local_packs, trusted_packs=trusted_packs, strict_defaults=strict_defaults, + trust_suppressions=trust_suppressions, ) # Fail-soft Clarion write: only when a client was injected (server has a URL). # An outage/403 yields a not-reachable WriteResult; never raises here. @@ -722,7 +724,10 @@ def _register_tools(self) -> None: Tool( name="scan", description="Whole-program taint scan of the project. Returns structured " - "findings, the suppression summary (active = the gate population), " + "findings, the suppression summary (active = unsuppressed defects; " + "by default the --fail-on gate evaluates the UNSUPPRESSED population so " + "repo-controlled baseline/waiver/judged annotate but do not clear it — " + "pass `trust_suppressions: true` for the trusted-local behaviour), " "and the gate verdict. Pass `where` to filter the returned findings " "(conjunctive; summary/gate stay whole-project) and `explain: true` to inline " "each active defect's taint provenance — one call, no per-finding explain_taint. " @@ -778,6 +783,13 @@ def _register_tools(self) -> None: "type": "boolean", "description": "Ignore repository-supplied custom configuration overrides (wardline.yaml)", }, + "trust_suppressions": { + "type": "boolean", + "description": "Let repository-controlled baseline/waiver/judged clear the gate " + "(they always annotate findings regardless). Default false — the gate " + "evaluates the unsuppressed population so a PR cannot self-suppress its " + "own defect. Use only on a trusted checkout; in CI prefer new_since.", + }, }, }, handler=lambda args, root: _scan( diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index 45a0aed6..dd7f592c 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -343,7 +343,7 @@ def test_scan_fail_on_inert_without_flag(tmp_path) -> None: assert res.exit_code == 0, res.output # no --fail-on -> never gates -def test_scan_baseline_suppresses_and_clears_gate(tmp_path) -> None: +def test_scan_baseline_annotates_but_does_not_clear_gate(tmp_path) -> None: proj = tmp_path / "proj" proj.mkdir() _write(proj, "svc.py", _LEAKY) @@ -359,15 +359,37 @@ def test_scan_baseline_suppresses_and_clears_gate(tmp_path) -> None: "version: 1\nentries:\n - fingerprint: " + fp + "\n rule_id: PY-WL-101\n path: svc.py\n message: m\n", encoding="utf-8", ) - # Second scan: the defect is baselined -> annotated + gate clears. + # SECURITY default: the defect is baselined for REPORTING (annotated), but the + # repository-controlled baseline must NOT clear the --fail-on gate. res = CliRunner().invoke(scan, [str(proj), "--output", str(out), "--fail-on", "ERROR"]) - assert res.exit_code == 0, res.output + assert res.exit_code == 1, res.output findings2 = [_json.loads(ln) for ln in out.read_text().splitlines() if ln.strip()] leak = next(f for f in findings2 if f["rule_id"] == "PY-WL-101") assert leak["suppressed"] == "baselined" # annotate-and-keep assert "1 suppressed" in res.output +def test_scan_baseline_clears_gate_with_trust_suppressions(tmp_path) -> None: + # --trust-suppressions restores the local ratchet: a baselined defect clears the gate. + proj = tmp_path / "proj" + proj.mkdir() + _write(proj, "svc.py", _LEAKY) + out = tmp_path / "f.jsonl" + CliRunner().invoke(scan, [str(proj), "--output", str(out)]) + findings = [_json.loads(ln) for ln in out.read_text().splitlines() if ln.strip()] + fp = next(f["fingerprint"] for f in findings if f["rule_id"] == "PY-WL-101") + bl = proj / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + bl.write_text( + "version: 1\nentries:\n - fingerprint: " + fp + "\n rule_id: PY-WL-101\n path: svc.py\n message: m\n", + encoding="utf-8", + ) + res = CliRunner().invoke( + scan, [str(proj), "--output", str(out), "--fail-on", "ERROR", "--trust-suppressions"] + ) + assert res.exit_code == 0, res.output + + _UNPARSEABLE = "def f(:\n" # syntax error -> WLN-ENGINE-PARSE-ERROR FACT @@ -466,10 +488,14 @@ def test_baseline_create_writes_file_and_suppresses_next_scan(tmp_path) -> None: doc = _yaml.safe_load(bl.read_text()) assert doc["version"] == 1 and len(doc["entries"]) >= 1 assert "baselined" in res.output - # Next scan: the captured defect is now baselined, gate clears. + # SECURITY default: the captured defect is now baselined for reporting, but the + # untrusted repository baseline must NOT clear the fail-on gate. out = tmp_path / "f.jsonl" res2 = runner.invoke(scan, [str(proj), "--output", str(out), "--fail-on", "ERROR"]) - assert res2.exit_code == 0, res2.output + assert res2.exit_code == 1, res2.output + # ...and --trust-suppressions restores the local ratchet (gate clears). + res3 = runner.invoke(scan, [str(proj), "--output", str(out), "--fail-on", "ERROR", "--trust-suppressions"]) + assert res3.exit_code == 0, res3.output def test_baseline_create_refuses_if_exists(tmp_path) -> None: @@ -1000,9 +1026,10 @@ def test_judge_low_confidence_fp_held_back_from_write(monkeypatch, tmp_path) -> assert not (proj / ".wardline" / "judged.yaml").exists() -def test_judge_write_then_scan_gate_is_cleared(monkeypatch, tmp_path) -> None: - # The regression that pins the headline panel finding: a JUDGED FP written by - # `judge --write` must suppress the finding for `scan --fail-on` too. +def test_judge_write_then_scan_still_trips_gate_by_default(monkeypatch, tmp_path) -> None: + # SECURITY: judged.yaml is repository-controlled input. A judged FP written by + # `judge --write` still ANNOTATES the finding (summary shows it) but must NOT clear + # the `scan --fail-on` gate by default. --trust-suppressions restores the old behaviour. import wardline.cli.judge as judge_cli from wardline.cli.main import cli @@ -1017,10 +1044,16 @@ def test_judge_write_then_scan_gate_is_cleared(monkeypatch, tmp_path) -> None: jres = CliRunner().invoke(cli, ["judge", str(proj), "--write"]) assert jres.exit_code == 0, jres.output assert (proj / ".wardline" / "judged.yaml").exists() - # 3) scan now sees the JUDGED suppression -> gate cleared, summary shows it + # 3) scan now sees the JUDGED suppression as an annotation, but the gate STILL trips. after = CliRunner().invoke(cli, ["scan", str(proj), "--output", str(out), "--fail-on", "INFO"]) - assert after.exit_code == 0, after.output + assert after.exit_code == 1, after.output assert "judged" in after.output + # 4) ...and --trust-suppressions clears the gate (trusted local checkout). + trusted = CliRunner().invoke( + cli, ["scan", str(proj), "--output", str(out), "--fail-on", "INFO", "--trust-suppressions"] + ) + assert trusted.exit_code == 0, trusted.output + assert "judged" in trusted.output def test_scan_fix_and_fix_command(tmp_path: Path) -> None: diff --git a/tests/unit/core/test_judge_run.py b/tests/unit/core/test_judge_run.py index 4b56be50..401226cf 100644 --- a/tests/unit/core/test_judge_run.py +++ b/tests/unit/core/test_judge_run.py @@ -93,6 +93,24 @@ def test_run_judge_write_holds_back_low_confidence_fp(tmp_path: Path) -> None: assert not (root / ".wardline" / "judged.yaml").exists() +def test_judge_workflow_still_consults_judged_after_write(tmp_path: Path) -> None: + # The judge flow is the TRUSTED local path: judged.yaml records are still consulted + # after `judge --write`, unchanged by the suppression-trust default. run_judge calls + # run_scan(trust_suppressions=True), and the emitted findings always carry the JUDGED + # annotation regardless of the flag — so the prior FP stays suppressed for the judge. + root = _leaky_project(tmp_path) + # 1) write a high-confidence FP for the active defect + first = run_judge(root, judge_caller=_fp_caller(0.95), write=True) + assert first.wrote >= 1 + assert (root / ".wardline" / "judged.yaml").exists() + # 2) the scan run_judge builds (trust_suppressions=True) now sees that defect as JUDGED + rescanned = run_scan(root, trust_suppressions=True) + judged_defects = [ + f for f in rescanned.findings if f.kind is Kind.DEFECT and f.suppressed is SuppressionState.JUDGED + ] + assert judged_defects, "the judged FP must remain consulted on the judge re-run" + + def test_run_judge_ignores_project_floor_without_trust(tmp_path: Path) -> None: root = _leaky_project(tmp_path) (root / "wardline.yaml").write_text("judge:\n write_confidence_floor: 0.0\n", encoding="utf-8") diff --git a/tests/unit/core/test_judged.py b/tests/unit/core/test_judged.py index 92324fb5..23ce62be 100644 --- a/tests/unit/core/test_judged.py +++ b/tests/unit/core/test_judged.py @@ -71,19 +71,61 @@ def test_rejudge_updates_existing_record(tmp_path: Path) -> None: def test_missing_provenance_raises(tmp_path: Path) -> None: # model_id / policy_hash / confidence are the audit primitive — never defaulted. + # verdict is present so this exercises the PROVENANCE guard, not the verdict guard. path = tmp_path / "judged.yaml" - path.write_text(f"version: 1\nfindings:\n - fingerprint: {'a' * 64}\n rationale: x\n", encoding="utf-8") + path.write_text( + f"version: 1\nfindings:\n - fingerprint: {'a' * 64}\n verdict: FALSE_POSITIVE\n rationale: x\n", + encoding="utf-8", + ) with pytest.raises(ConfigError): load_judged(path) def test_out_of_range_confidence_raises(tmp_path: Path) -> None: + # verdict is present so this reaches the confidence range check, not the verdict guard. path = tmp_path / "judged.yaml" path.write_text( "version: 1\nfindings:\n" - f" - fingerprint: {'a' * 64}\n rationale: x\n model_id: m\n" + f" - fingerprint: {'a' * 64}\n verdict: FALSE_POSITIVE\n rationale: x\n model_id: m\n" " policy_hash: sha256:x\n confidence: 1.5\n", encoding="utf-8", ) with pytest.raises(ConfigError): load_judged(path) + + +def test_missing_verdict_raises(tmp_path: Path) -> None: + # A judged record with no verdict cannot be trusted as a FALSE_POSITIVE suppression. + path = tmp_path / "judged.yaml" + path.write_text( + "version: 1\nfindings:\n" + f" - fingerprint: {'a' * 64}\n rationale: x\n model_id: m\n" + " policy_hash: sha256:x\n confidence: 0.9\n", + encoding="utf-8", + ) + with pytest.raises(ConfigError, match="verdict"): + load_judged(path) + + +def test_non_false_positive_verdict_rejected(tmp_path: Path) -> None: + # A hand-edited TRUE_POSITIVE (or any non-FP) verdict must not be smuggled in as a + # silent suppression — judged.yaml only ever records FALSE_POSITIVE. + path = tmp_path / "judged.yaml" + path.write_text( + "version: 1\nfindings:\n" + f" - fingerprint: {'a' * 64}\n verdict: TRUE_POSITIVE\n rationale: x\n model_id: m\n" + " policy_hash: sha256:x\n confidence: 0.9\n", + encoding="utf-8", + ) + with pytest.raises(ConfigError, match="FALSE_POSITIVE"): + load_judged(path) + + +def test_write_judged_roundtrip_loads_with_verdict(tmp_path: Path) -> None: + # build_judged_document always emits verdict: FALSE_POSITIVE, so a machine round-trip + # stays valid under the new verdict requirement. + path = tmp_path / ".wardline" / "judged.yaml" + write_judged(path, [_fp()]) + doc = yaml.safe_load(path.read_text()) + assert doc["findings"][0]["verdict"] == "FALSE_POSITIVE" + assert load_judged(path).match("a" * 64) is not None diff --git a/tests/unit/core/test_run.py b/tests/unit/core/test_run.py index e8503a9a..11f7a47d 100644 --- a/tests/unit/core/test_run.py +++ b/tests/unit/core/test_run.py @@ -1,9 +1,13 @@ +import textwrap +from datetime import UTC, datetime from pathlib import Path +from unittest.mock import MagicMock, patch import pytest from wardline.core.errors import ConfigError -from wardline.core.finding import Kind, Severity, SuppressionState +from wardline.core.finding import Finding, Kind, Location, Severity, SuppressionState +from wardline.core.judged import JudgedFP, write_judged from wardline.core.run import ScanResult, ScanSummary, gate_decision, run_scan FIXTURE = Path("tests/fixtures/sample_project") @@ -28,7 +32,8 @@ def test_run_scan_returns_findings_summary_and_context() -> None: # invariants (total == len(findings); active == active-defect count), which # hold for any fixture regardless of finding count. assert result.summary.total == len(result.findings) - # active is the count of non-suppressed DEFECTs (the gate population) + # active is the count of non-suppressed DEFECTs in the emitted findings (the gate + # evaluates ScanResult.gate_findings, a separate unsuppressed population) active = sum(1 for f in result.findings if f.kind is Kind.DEFECT and f.suppressed is SuppressionState.ACTIVE) assert result.summary.active == active # context is carried for explain_finding to reuse @@ -111,10 +116,220 @@ def test_run_scan_baselined_count_distinguishes_categories(tmp_path: Path) -> No assert result.summary.waived == 0 assert result.summary.judged == 0 assert result.summary.active == 0 - # And the gate clears now that the only ERROR defect is suppressed. + # SECURITY default: a repository-controlled baseline ANNOTATES the defect but does + # NOT clear the --fail-on gate — the gate evaluates the unsuppressed population. + assert gate_decision(result, Severity.ERROR).tripped is True + # ...and --trust-suppressions restores the local ratchet: the baselined defect clears. + trusted = run_scan(proj, trust_suppressions=True) + assert trusted.summary.baselined == 1 + assert gate_decision(trusted, Severity.ERROR).tripped is False + + +def _leaky_proj(tmp_path: Path) -> tuple[Path, str]: + proj = tmp_path / "proj" + proj.mkdir() + (proj / "svc.py").write_text(_LEAKY, encoding="utf-8") + fp = next(f for f in run_scan(proj).findings if f.rule_id == "PY-WL-101").fingerprint + return proj, fp + + +def _write_baseline(proj: Path, fp: str) -> None: + bl = proj / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + bl.write_text( + f"version: 1\nentries:\n - fingerprint: {fp}\n rule_id: PY-WL-101\n path: svc.py\n message: m\n", + encoding="utf-8", + ) + + +def _write_waiver(proj: Path, fp: str) -> None: + (proj / "wardline.yaml").write_text( + f"waivers:\n - fingerprint: {fp}\n reason: validated downstream\n", encoding="utf-8" + ) + + +def _write_judged(proj: Path, fp: str) -> None: + write_judged( + proj / ".wardline" / "judged.yaml", + [ + JudgedFP( + fingerprint=fp, + rule_id="PY-WL-101", + path="svc.py", + message="m", + rationale="model ruled FP", + model_id="anthropic/claude-opus-4-8", + confidence=0.95, + recorded_at=datetime(2026, 5, 30, tzinfo=UTC), + policy_hash="sha256:abc", + ) + ], + ) + + +@pytest.mark.parametrize( + "writer,state", + [ + (_write_baseline, SuppressionState.BASELINED), + (_write_waiver, SuppressionState.WAIVED), + (_write_judged, SuppressionState.JUDGED), + ], +) +def test_gate_trips_by_default_on_suppressed_defect(tmp_path: Path, writer, state) -> None: + # SECURITY: a repository-controlled suppression (baseline / waiver / judged) ANNOTATES + # the defect but must NOT clear the --fail-on gate by default, so a malicious PR cannot + # self-suppress its own new defect. + proj, fp = _leaky_proj(tmp_path) + writer(proj, fp) + result = run_scan(proj) + # Annotated in the emitted findings... + leak = next(f for f in result.findings if f.rule_id == "PY-WL-101") + assert leak.suppressed is state + # ...but the gate still trips on the unsuppressed gate population. + assert gate_decision(result, Severity.ERROR).tripped is True + + +@pytest.mark.parametrize("writer", [_write_baseline, _write_waiver, _write_judged]) +def test_trust_suppressions_restores_old_gate_clearing(tmp_path: Path, writer) -> None: + proj, fp = _leaky_proj(tmp_path) + writer(proj, fp) + result = run_scan(proj, trust_suppressions=True) + # gate_findings is the None sentinel -> gate falls back to the suppressed findings. + assert result.gate_findings is None assert gate_decision(result, Severity.ERROR).tripped is False +def test_gate_findings_is_unsuppressed_population(tmp_path: Path) -> None: + proj, fp = _leaky_proj(tmp_path) + _write_baseline(proj, fp) + result = run_scan(proj) + assert result.gate_findings is not None + gate_leak = next(f for f in result.gate_findings if f.rule_id == "PY-WL-101") + assert gate_leak.suppressed is SuppressionState.ACTIVE # gate sees it active + + +def test_directly_constructed_scanresult_falls_back_to_findings() -> None: + # The None sentinel: a ScanResult built without gate_findings (e.g. in a test) must + # gate on its findings, never silently pass because gate_findings defaulted empty. + leak = Finding( + rule_id="PY-WL-101", + message="m", + severity=Severity.ERROR, + kind=Kind.DEFECT, + location=Location(path="svc.py", line_start=1), + fingerprint="a" * 64, + suppressed=SuppressionState.ACTIVE, + ) + result = ScanResult( + findings=[leak], + summary=ScanSummary(total=1, active=1, baselined=0, waived=0, judged=0), + files_scanned=1, + context=None, + ) + assert result.gate_findings is None + assert gate_decision(result, Severity.ERROR).tripped is True + + +def test_lineless_defect_does_not_trip_gate(tmp_path: Path) -> None: + # Regression guard for the bug PR #25 had (gate_findings = list(raw)): a lineless + # DEFECT must be downgraded to a non-gating FACT in the gate population, exactly as + # apply_suppressions does for the emitted findings — so it never trips the gate. + from wardline.core.baseline import Baseline + from wardline.core.finding import ENGINE_PATH # noqa: F401 (documents the carve-out) + from wardline.core.suppression import apply_suppressions, gate_trips + from wardline.core.waivers import WaiverSet + + lineless = Finding( + rule_id="PY-WL-101", + message="m", + severity=Severity.ERROR, + kind=Kind.DEFECT, + location=Location(path="svc.py", line_start=None), + fingerprint="b" * 64, + suppressed=SuppressionState.ACTIVE, + ) + # This is the EXACT empty-suppression transform run_scan applies to build gate_findings. + gate_pop = apply_suppressions([lineless], Baseline(frozenset()), WaiverSet([]), today=datetime.now(UTC).date()) + downgraded = next(f for f in gate_pop if f.location.path == "svc.py") + assert downgraded.kind is Kind.FACT # DEFECT -> FACT, no longer gating + assert gate_trips(gate_pop, Severity.ERROR) is False + + +def test_new_since_scopes_both_populations_and_resists_suppression(tmp_path: Path) -> None: + # --new-since is the SECURE CI ratchet: it scopes BOTH the emitted findings and the + # gate population. A pre-existing defect OUTSIDE the delta does not trip; a new one + # INSIDE the delta does, and a repo suppression cannot clear it. + callee_src = """ + from wardline.decorators import external_boundary + @external_boundary + def read_raw(p): + return p + """ + caller_src = """ + from callee import read_raw + from wardline.decorators import trusted + @trusted(level='ASSURED') + def f(p): + return read_raw(p) + """ + unrelated_src = """ + from wardline.decorators import external_boundary, trusted + @external_boundary + def read_raw_unrelated(p): + return p + @trusted(level='ASSURED') + def h(p): + return read_raw_unrelated(p) + """ + (tmp_path / "callee.py").write_text(textwrap.dedent(callee_src), encoding="utf-8") + (tmp_path / "caller.py").write_text(textwrap.dedent(caller_src), encoding="utf-8") + (tmp_path / "unrelated.py").write_text(textwrap.dedent(unrelated_src), encoding="utf-8") + + # Try to suppress the NEW (in-delta) defect via a committed baseline — must not help. + first = run_scan(tmp_path) + new_fp = next(f for f in first.findings if f.qualname == "caller.f").fingerprint + bl = tmp_path / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + bl.write_text( + f"version: 1\nentries:\n - fingerprint: {new_fp}\n rule_id: PY-WL-101\n path: caller.py\n" + " message: m\n", + encoding="utf-8", + ) + + with patch("subprocess.run") as mock_run: + + def run_dispatch(args, **kwargs): + if "rev-parse" in args and "--show-toplevel" in args: + m = MagicMock(returncode=0) + m.stdout = f"{tmp_path.resolve()}\n" + return m + if "rev-parse" in args and "--verify" in args: + m = MagicMock(returncode=0) + m.stdout = "abc123\n" + return m + if "diff" in args and "--name-only" in args: + m = MagicMock(returncode=0) + m.stdout = "callee.py\n" + return m + if "ls-files" in args: + m = MagicMock(returncode=0) + m.stdout = "" + return m + raise ValueError(f"Unexpected git command: {args}") + + mock_run.side_effect = run_dispatch + result = run_scan(tmp_path, new_since="HEAD~1") + + # The in-delta caller.f stays ACTIVE in the gate population despite the baseline entry. + assert result.gate_findings is not None + gate_by_qn = {f.qualname: f for f in result.gate_findings if f.kind is Kind.DEFECT} + assert gate_by_qn["caller.f"].suppressed is SuppressionState.ACTIVE + # The out-of-delta unrelated.h is scoped OUT of the gate (delta: unchanged). + assert gate_by_qn["unrelated.h"].suppressed is SuppressionState.BASELINED + # Net: the gate trips on the new defect, and the repo baseline did not clear it. + assert gate_decision(result, Severity.ERROR).tripped is True + + def test_run_scan_counts_unanalyzed_parse_error(tmp_path: Path) -> None: # (b) A file that cannot be parsed is discovered-but-not-analysed: a # Severity.NONE FACT that never trips the severity gate. ScanSummary.unanalyzed diff --git a/tests/unit/mcp/test_server_suppression.py b/tests/unit/mcp/test_server_suppression.py index 9ad3bf24..7bd6d82b 100644 --- a/tests/unit/mcp/test_server_suppression.py +++ b/tests/unit/mcp/test_server_suppression.py @@ -41,6 +41,29 @@ def _call(server: WardlineMCPServer, name: str, arguments: dict) -> dict: return json.loads(resp["result"]["content"][0]["text"]) +def test_mcp_scan_gate_trips_on_baselined_defect_by_default(tmp_path: Path) -> None: + # SECURITY parity with the CLI: a repository-controlled baseline annotates the defect + # but the MCP scan gate evaluates the unsuppressed population by default. + proj = _leaky_project(tmp_path) + server = WardlineMCPServer(root=proj) + first = _call(server, "scan", {}) + fp = next(f["fingerprint"] for f in first["findings"] if f["rule_id"] == "PY-WL-101") + bl = proj / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + bl.write_text( + f"version: 1\nentries:\n - fingerprint: {fp}\n rule_id: PY-WL-101\n path: svc.py\n message: m\n", + encoding="utf-8", + ) + # Default: annotated baselined, but the gate trips. + default = _call(server, "scan", {"fail_on": "ERROR"}) + leak = next(f for f in default["findings"] if f["rule_id"] == "PY-WL-101") + assert leak["suppressed"] == "baselined" + assert default["gate"]["tripped"] is True + # trust_suppressions restores the trusted-local behaviour: the gate clears. + trusted = _call(server, "scan", {"fail_on": "ERROR", "trust_suppressions": True}) + assert trusted["gate"]["tripped"] is False + + def test_baseline_optional_reason(tmp_path: Path) -> None: proj = _leaky_project(tmp_path) server = WardlineMCPServer(root=proj)