From 9543165447925797a92dea980b9ae5d7252bca43 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 12:44:56 +1000 Subject: [PATCH 01/17] fix(legis): --allow-dirty emits an unsigned dev artifact instead of refusing (wardline-30f3d38fa5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dogfood friction #1: on a dirty tree `scan --format legis` failed exit 2 naming an `allow_dirty` flag that was never exposed on the CLI — presenting identically to "legis is broken." Expose `--allow-dirty` (CLI) / `allow_dirty` (MCP scan). The honest fix: a dirty tree under allow_dirty does NOT sign. The only tree_sha readable is the *committed* one, which does not describe dirty working content — signing it would be false provenance (the `_git_tree_sha` guard). Instead it falls through to the UNSIGNED dev artifact, clearly marked `dirty: true` (legis records it `unverified`). Signing stays clean-tree-only; verification stays clean-tree/CI. The loud refusal without --allow-dirty is unchanged. CLI emits a stderr warning when the artifact is dirty/unsigned; MCP reports `signed:false` + `dirty:true` in legis_artifact_status. legis ignores the unknown `dirty` top-level key on the unverified path, so ingest is unaffected; the golden clean-tree signature is byte-unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/wardline/cli/scan.py | 20 ++++++++++ src/wardline/core/legis.py | 44 +++++++++++++++------- src/wardline/mcp/server.py | 22 ++++++++++- tests/unit/cli/test_cli.py | 52 ++++++++++++++++++++++++++ tests/unit/core/test_legis_artifact.py | 17 ++++++++- 5 files changed, 138 insertions(+), 17 deletions(-) diff --git a/src/wardline/cli/scan.py b/src/wardline/cli/scan.py index 560d413b..c1c4eb33 100644 --- a/src/wardline/cli/scan.py +++ b/src/wardline/cli/scan.py @@ -113,6 +113,16 @@ "default the gate evaluates the unsuppressed population so a PR cannot self-suppress." 
), ) +@click.option( + "--allow-dirty", + is_flag=True, + default=False, + help=( + "For --format legis only: on a dirty working tree, emit an UNSIGNED, clearly-marked " + "(dirty: true) dev artifact instead of refusing. Signing stays clean-tree-only; this " + "lets the dev/tour loop exercise the Wardline->legis handshake without a commit." + ), +) def scan( path: Path, config_path: Path | None, @@ -131,6 +141,7 @@ def scan( strict_defaults: bool, allow_source_root_escape: bool, trust_suppressions: bool, + allow_dirty: bool, ) -> None: """Scan PATH for findings.""" if fmt == "sarif": @@ -235,8 +246,17 @@ def confirm_cb(rel_path: str, orig: str, replacement: str, f: Finding) -> bool: root=path, config=legis_cfg, key=legis_key.encode("utf-8") if legis_key else None, + allow_dirty=allow_dirty, ) output.write_text(json.dumps(artifact, indent=2, sort_keys=True) + "\n", encoding="utf-8") + # Loud signal: an artifact marked dirty is UNSIGNED (dev/tour only). legis + # records it `unverified`; never gate CI on it. + if artifact.get("dirty"): + click.echo( + "warning: dirty working tree — emitted an UNSIGNED legis dev artifact " + "(dirty: true, legis records it unverified). Commit for a signed artifact.", + err=True, + ) # Weft emission is additive: a FiligreeEmitError (HTTP >= 400) is a Wardline # payload bug -> caught below -> exit 2; an unreachable sibling warns + continues. if filigree_url is not None: diff --git a/src/wardline/core/legis.py b/src/wardline/core/legis.py index cf63d8bd..050df3a5 100644 --- a/src/wardline/core/legis.py +++ b/src/wardline/core/legis.py @@ -220,12 +220,16 @@ def build_legis_artifact( the list — legis enforces its own 500-finding limit and a larger scan is rejected loudly rather than silently truncated. 
- When ``key`` is given the scan is signed and MUST carry honest provenance - (``scanner_identity``, ``rule_set_version``, ``commit_sha``, ``tree_sha``); signing - a non-repo or dirty tree is refused (:class:`LegisArtifactError`) because a - ``tree_sha`` that does not match the scanned content is false provenance. When - ``key`` is None the scan is emitted unsigned with best-effort provenance — legis - records it as ``unverified`` (the trust-the-agent posture before a key is set). + When ``key`` is given AND the tree is clean the scan is signed and MUST carry + honest provenance (``scanner_identity``, ``rule_set_version``, ``commit_sha``, + ``tree_sha``); signing a non-repo is refused (:class:`LegisArtifactError`). Signing + is clean-tree-only: a dirty tree with a key is refused (:class:`LegisArtifactError`) + UNLESS ``allow_dirty=True``, which does NOT sign — it emits the unsigned dev + artifact instead (a ``tree_sha`` that does not match dirty working content is false + provenance). When ``key`` is None — or a dirty tree under ``allow_dirty`` — the scan + is emitted unsigned with best-effort provenance and a ``dirty: true`` marker on a + dirty tree; legis records it as ``unverified`` (the trust-the-agent posture before a + key is set, and the dev/tour loop without a commit). Sign last, over the otherwise-complete scan: ``artifact_signature`` is added after the rest is in place, exactly as legis verifies (scan-minus-signature). @@ -243,16 +247,19 @@ def build_legis_artifact( } commit, dirty = git_state(root) - if key is not None: + # Signing is CLEAN-TREE-ONLY. A key + clean tree produces the signed, verified + # artifact. A key + dirty tree is refused loudly UNLESS ``allow_dirty`` — and even + # then we do NOT sign: the only ``tree_sha`` we can read is the *committed* tree, + # which does not describe dirty working content, so signing it would be false + # provenance (see :func:`_git_tree_sha`). 
Instead ``allow_dirty`` falls through to + # the unsigned dev artifact below, clearly marked ``dirty: true`` (legis records it + # ``unverified``). This lets the dev/tour loop exercise the full Wardline→legis + # handshake without a commit, while keeping signature *verification* clean-tree-only. + if key is not None and not dirty: if commit is None: raise LegisArtifactError( "cannot sign legis artifact: not a git repository, so commit/tree provenance is unavailable" ) - if dirty and not allow_dirty: - raise LegisArtifactError( - "refusing to sign a legis artifact for a dirty working tree " - "(uncommitted changes); commit first or pass allow_dirty" - ) tree = _git_tree_sha(root) if tree is None: raise LegisArtifactError("cannot sign legis artifact: tree SHA unavailable") @@ -260,12 +267,21 @@ def build_legis_artifact( scan["tree_sha"] = tree scan[ARTIFACT_SIGNATURE_FIELD] = sign_artifact(scan, key) return scan + if key is not None and dirty and not allow_dirty: + raise LegisArtifactError( + "refusing to sign a legis artifact for a dirty working tree " + "(uncommitted changes); commit first or pass allow_dirty for an unsigned dev artifact" + ) - # Unsigned: supply whatever provenance we can honestly read; legis marks it - # unverified. Never fabricate a tree_sha — omit it if unreadable. + # Unsigned (no key, or key + allow_dirty on a dirty tree): supply whatever + # provenance we can honestly read; legis marks it unverified. Never fabricate a + # tree_sha — omit it if unreadable. A dirty tree is flagged so neither the agent + # nor a human mistakes the committed provenance for the scanned working content. 
if commit is not None: scan["commit_sha"] = commit tree = _git_tree_sha(root) if tree is not None: scan["tree_sha"] = tree + if dirty: + scan["dirty"] = True return scan diff --git a/src/wardline/mcp/server.py b/src/wardline/mcp/server.py index 7c8418d6..e37a73b5 100644 --- a/src/wardline/mcp/server.py +++ b/src/wardline/mcp/server.py @@ -323,6 +323,7 @@ def _attach_legis_artifact( strict_defaults=strict_defaults, ) key_bytes = key_str.encode("utf-8") if key_str else None + allow_dirty = bool(args.get("allow_dirty") or False) status: dict[str, Any] = { "configured": True, "signed": False, @@ -330,12 +331,17 @@ def _attach_legis_artifact( "reason": None, } try: - artifact = build_legis_artifact(result, root=path, config=cfg, key=key_bytes) + artifact = build_legis_artifact(result, root=path, config=cfg, key=key_bytes, allow_dirty=allow_dirty) except LegisArtifactError as exc: status["reason"] = str(exc) response["legis_artifact_status"] = status return - status["signed"] = key_bytes is not None + # A dirty tree under allow_dirty falls through to the unsigned dev artifact: it is + # never signed even with a key present (false-provenance guard), and legis records + # it `unverified`. Report signed honestly from the artifact, not from key presence. + dirty = bool(artifact.get("dirty")) + status["signed"] = key_bytes is not None and not dirty + status["dirty"] = dirty response["legis_artifact"] = artifact response["legis_artifact_status"] = status @@ -795,6 +801,18 @@ def _register_tools(self) -> None: "evaluates the unsuppressed population so a PR cannot self-suppress its " "own defect. 
Use only on a trusted checkout; in CI prefer new_since.", }, + "legis_artifact": { + "type": "boolean", + "description": "Attach the verbatim-postable legis scan-artifact " + "(`legis_artifact` block) even when no signing key is provisioned " + "(unsigned, for legis's optional-verify posture).", + }, + "allow_dirty": { + "type": "boolean", + "description": "For the legis artifact only: on a dirty tree emit an UNSIGNED, " + "clearly-marked (dirty: true) dev artifact instead of refusing to sign. " + "Signing stays clean-tree-only; legis records it unverified.", + }, }, }, handler=lambda args, root: _scan( diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index 1902a4c1..924728f3 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -92,6 +92,58 @@ def test_scan_default_output_lands_in_scanned_path(tmp_path: Path) -> None: assert (project / "findings.jsonl").exists() +def _git(repo: Path, *args: str) -> None: + import subprocess + + subprocess.run(["git", *args], cwd=repo, check=True, capture_output=True) + + +def _legis_committed_repo(tmp_path: Path) -> Path: + import shutil + + repo = tmp_path / "proj" + shutil.copytree(FIXTURE, repo) + _git(repo, "init", "-q") + _git(repo, "config", "user.email", "t@example.com") + _git(repo, "config", "user.name", "t") + _git(repo, "add", "-A") + _git(repo, "commit", "-qm", "init") + return repo + + +def test_scan_format_legis_dirty_tree_refuses_without_allow_dirty(tmp_path: Path) -> None: + # With a signing key + dirty tree and NO --allow-dirty, the CLI is loud (exit 2): + # this is the friction the dogfood report flagged, kept as the default. 
+ repo = _legis_committed_repo(tmp_path) + (repo / "svc.py").write_text("# dirty edit\n", encoding="utf-8") + out = tmp_path / "scan.legis.json" + result = CliRunner().invoke( + cli, + ["scan", str(repo), "--format", "legis", "--output", str(out)], + env={"WARDLINE_LEGIS_ARTIFACT_KEY": "devkey"}, + ) + assert result.exit_code == 2 + assert "dirty working tree" in result.output + + +def test_scan_format_legis_allow_dirty_emits_unsigned_marked_artifact(tmp_path: Path) -> None: + # --allow-dirty turns the refusal into an UNSIGNED, clearly-marked dev artifact so + # the dev/tour loop can exercise the Wardline->legis handshake without a commit. + repo = _legis_committed_repo(tmp_path) + (repo / "svc.py").write_text("# dirty edit\n", encoding="utf-8") + out = tmp_path / "scan.legis.json" + result = CliRunner().invoke( + cli, + ["scan", str(repo), "--format", "legis", "--output", str(out), "--allow-dirty"], + env={"WARDLINE_LEGIS_ARTIFACT_KEY": "devkey"}, + ) + assert result.exit_code == 0 + artifact = _json.loads(out.read_text(encoding="utf-8")) + assert "artifact_signature" not in artifact + assert artifact["dirty"] is True + assert "UNSIGNED legis dev artifact" in result.output + + def test_scan_config_error_exits_2(tmp_path: Path) -> None: import shutil diff --git a/tests/unit/core/test_legis_artifact.py b/tests/unit/core/test_legis_artifact.py index dfda9b12..e3820894 100644 --- a/tests/unit/core/test_legis_artifact.py +++ b/tests/unit/core/test_legis_artifact.py @@ -259,10 +259,25 @@ def test_signing_refuses_dirty_tree(tmp_path) -> None: _build(repo, key=b"k") -def test_allow_dirty_signs_anyway(tmp_path) -> None: +def test_allow_dirty_emits_unsigned_marked_artifact(tmp_path) -> None: + # The honest fix for the dogfood #1 friction: a dirty tree with allow_dirty does + # NOT sign (signing the committed tree_sha for dirty working content is false + # provenance — see _git_tree_sha). 
It emits an UNSIGNED, clearly-marked dev + # artifact instead: no signature, dirty:true, legis records it `unverified`. repo = _committed_repo(tmp_path) (repo / "svc.py").write_text(_LEAKY + "\n# dirty\n", encoding="utf-8") scan = _build(repo, key=b"k", allow_dirty=True) + assert "artifact_signature" not in scan + assert scan["dirty"] is True + # best-effort provenance (HEAD commit) is still honestly recorded + assert scan["commit_sha"] + + +def test_clean_signed_artifact_has_no_dirty_marker(tmp_path) -> None: + # A clean tree signs as before and carries no dirty marker — the signed wire is + # byte-unchanged (guards the golden-signature contract). + scan = _build(_committed_repo(tmp_path), key=b"k") + assert "dirty" not in scan assert scan["artifact_signature"].startswith("hmac-sha256:v2:") From 77cbbd9c43178f89f64da8a46e7476eaffc21461 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 12:48:56 +1000 Subject: [PATCH 02/17] feat(gate): verdict carries a human reason + evaluated population (wardline-be75c6676d) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dogfood friction #2: a scan reporting summary.active:0 AND gate.tripped:true read as a bug — the agent had to run scan twice (with/without trust_suppressions) and read --help to learn the gate evaluates the unsuppressed (baselined-included) population by default. GateDecision now carries `reason` and `evaluated`. `reason` names the count and class that decided the verdict — "1 suppressed ERROR+ defect(s) (baseline/waiver/ judged) not cleared; pass --trust-suppressions (trusted checkout) or --new-since (PR)" when the trip is solely from suppressed-but-gated findings, "N active ERROR+ defect(s)" on a genuine trip (no misdirection to the suppression flags), and the mixed form when both. `evaluated` names the population: "unsuppressed (repository baseline/waiver/judged ignored)" by default, "post-suppression … honored" under --trust-suppressions. 
Counts come from `gate_breakdown` over the ANNOTATED findings so they match what the agent reads in `summary`. Surfaced in the MCP scan gate block, the agent_summary gate block, and on CLI stderr when the gate trips (never a silent exit 1). Both None when no --fail-on. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/wardline/cli/scan.py | 7 +++- src/wardline/core/agent_summary.py | 2 ++ src/wardline/core/run.py | 47 ++++++++++++++++++++++++-- src/wardline/core/suppression.py | 27 +++++++++++++++ src/wardline/mcp/server.py | 8 ++++- tests/unit/cli/test_cli.py | 20 +++++++++++ tests/unit/core/test_agent_summary.py | 16 +++++++++ tests/unit/core/test_cli_mcp_parity.py | 2 ++ tests/unit/core/test_run.py | 37 ++++++++++++++++++++ 9 files changed, 162 insertions(+), 4 deletions(-) diff --git a/src/wardline/cli/scan.py b/src/wardline/cli/scan.py index c1c4eb33..37134418 100644 --- a/src/wardline/cli/scan.py +++ b/src/wardline/cli/scan.py @@ -344,7 +344,12 @@ def confirm_cb(rel_path: str, orig: str, replacement: str, f: Finding) -> bool: f"(see WLN-ENGINE-* facts in {output}).", err=True, ) - gate_tripped = fail_on is not None and gate_decision(result, Severity(fail_on)).tripped + decision = gate_decision(result, Severity(fail_on)) if fail_on is not None else None + gate_tripped = decision is not None and decision.tripped + if decision is not None and decision.tripped: + # Never let "0 active + gate FAILED" read as a bug: say why and which population. + click.echo(f"gate: FAILED (--fail-on {decision.fail_on}) — {decision.reason}", err=True) + click.echo(f"gate: evaluated {decision.evaluated}", err=True) # Independent of the severity gate: opt-in enforcement of "everything analysed". 
if gate_tripped or (fail_on_unanalyzed and s.unanalyzed): raise SystemExit(1) diff --git a/src/wardline/core/agent_summary.py b/src/wardline/core/agent_summary.py index 44a6ff37..6405ebeb 100644 --- a/src/wardline/core/agent_summary.py +++ b/src/wardline/core/agent_summary.py @@ -68,6 +68,8 @@ def to_dict(self) -> dict[str, Any]: "tripped": self.gate.tripped, "fail_on": self.gate.fail_on, "exit_class": self.gate.exit_class, + "reason": self.gate.reason, + "evaluated": self.gate.evaluated, }, "integrations": { "filigree_emit": dict(self.filigree_emit), diff --git a/src/wardline/core/run.py b/src/wardline/core/run.py index a1100222..2978f06c 100644 --- a/src/wardline/core/run.py +++ b/src/wardline/core/run.py @@ -82,6 +82,13 @@ class GateDecision: tripped: bool fail_on: str | None exit_class: int # 0 clean, 1 gate tripped, 2 reserved for tool errors (CLI layer) + # A human-readable verdict so "summary.active:0 + gate.tripped:true" never reads as + # a bug: ``reason`` names the count and class of defects that decided it (and the + # escape hatches when the trip is solely from suppressed-but-gated findings); + # ``evaluated`` names the population it judged (unsuppressed by default vs honored + # under --trust-suppressions). Both None when no threshold is set (no gate). + reason: str | None = None + evaluated: str | None = None def run_scan( @@ -280,6 +287,42 @@ def gate_decision(result: ScanResult, fail_on: Severity | None) -> GateDecision: # None SENTINEL: evaluate the unsuppressed gate population when present (secure # default), else the suppressed ``findings`` (trusted ``--trust-suppressions`` / # a directly-constructed ScanResult with no gate_findings). 
- gate_population = result.gate_findings if result.gate_findings is not None else result.findings + honors_suppressions = result.gate_findings is None + gate_population = result.findings if honors_suppressions else result.gate_findings + assert gate_population is not None # narrow for mypy; the sentinel branch set findings tripped = gate_trips(gate_population, fail_on) - return GateDecision(tripped=tripped, fail_on=fail_on.value, exit_class=1 if tripped else 0) + sev = fail_on.value + evaluated = ( + "post-suppression (repository baseline/waiver/judged honored — trusted-local)" + if honors_suppressions + else "unsuppressed (repository baseline/waiver/judged ignored)" + ) + reason = _gate_reason(result, fail_on, tripped=tripped, honors_suppressions=honors_suppressions) + return GateDecision( + tripped=tripped, + fail_on=sev, + exit_class=1 if tripped else 0, + reason=reason, + evaluated=evaluated, + ) + + +def _gate_reason(result: ScanResult, fail_on: Severity, *, tripped: bool, honors_suppressions: bool) -> str: + """The human verdict string. Counts the ANNOTATED population (``result.findings``) + so the numbers match what the agent reads in ``summary``.""" + from wardline.core.suppression import gate_breakdown + + sev = fail_on.value + active, suppressed = gate_breakdown(result.findings, fail_on) + if not tripped: + return f"no {sev}+ defects in the evaluated population" + # Under --trust-suppressions the suppressed defects are honored (cleared), so only + # active ones can have tripped the gate; never misdirect to the suppression flags. 
+ if honors_suppressions: + return f"{active} active {sev}+ defect(s) at or above {sev}" + escape = "pass --trust-suppressions (trusted checkout) or --new-since (PR)" + if active and suppressed: + return f"{active} active + {suppressed} suppressed {sev}+ defect(s) gate by default; {escape}" + if suppressed: + return f"{suppressed} suppressed {sev}+ defect(s) (baseline/waiver/judged) not cleared; {escape}" + return f"{active} active {sev}+ defect(s) at or above {sev}" diff --git a/src/wardline/core/suppression.py b/src/wardline/core/suppression.py index ca3090a8..197e7b75 100644 --- a/src/wardline/core/suppression.py +++ b/src/wardline/core/suppression.py @@ -85,3 +85,30 @@ def gate_trips(findings: Iterable[Finding], fail_on: Severity) -> bool: if rank is not None and rank >= threshold: return True return False + + +def gate_breakdown(findings: Iterable[Finding], fail_on: Severity) -> tuple[int, int]: + """Count gate-relevant DEFECTs at/above ``fail_on`` in the ANNOTATED population, + split into ``(active, suppressed)``. + + Same predicate as :func:`gate_trips` (DEFECT, non-PREVIEW, severity >= threshold) + but counts instead of short-circuiting and partitions by whether the finding is + ACTIVE or repository-suppressed (baselined / waived / judged). Lets the gate verdict + say *which* population tripped it without re-deriving the rule. Under the secure + default the suppressed count is exactly the set that gates only because suppressions + are ignored — the number an agent clears with ``--trust-suppressions``/``--new-since``. 
+ """ + threshold = _RANK[fail_on] + active = 0 + suppressed = 0 + for f in findings: + if f.kind is not Kind.DEFECT or f.maturity == Maturity.PREVIEW: + continue + rank = _RANK.get(f.severity) + if rank is None or rank < threshold: + continue + if f.suppressed is SuppressionState.ACTIVE: + active += 1 + else: + suppressed += 1 + return active, suppressed diff --git a/src/wardline/mcp/server.py b/src/wardline/mcp/server.py index e37a73b5..b962345b 100644 --- a/src/wardline/mcp/server.py +++ b/src/wardline/mcp/server.py @@ -263,7 +263,13 @@ def _scan( # silent under-scan reaches the agent, not just the human-facing stderr. "unanalyzed": result.summary.unanalyzed, }, - "gate": {"tripped": decision.tripped, "fail_on": decision.fail_on, "exit_class": decision.exit_class}, + "gate": { + "tripped": decision.tripped, + "fail_on": decision.fail_on, + "exit_class": decision.exit_class, + "reason": decision.reason, + "evaluated": decision.evaluated, + }, "loomweave": loomweave_block, "filigree": filigree_block, "loomweave_write": loomweave_status, diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index 924728f3..0c832339 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -144,6 +144,26 @@ def test_scan_format_legis_allow_dirty_emits_unsigned_marked_artifact(tmp_path: assert "UNSIGNED legis dev artifact" in result.output +_LEAKY_SRC = ( + "from wardline.decorators import external_boundary, trusted\n" + "@external_boundary\ndef raw(p):\n return p\n" + "@trusted\ndef leaky(p):\n return raw(p)\n" +) + + +def test_scan_gate_trip_prints_reason_and_population(tmp_path: Path) -> None: + # A tripped gate must say WHY on stderr — never just exit 1 silently (dogfood #2). 
+ project = tmp_path / "proj" + project.mkdir() + (project / "svc.py").write_text(_LEAKY_SRC, encoding="utf-8") + out = tmp_path / "o.jsonl" + result = CliRunner().invoke(cli, ["scan", str(project), "--fail-on", "ERROR", "--output", str(out)]) + assert result.exit_code == 1 + assert "gate: FAILED (--fail-on ERROR)" in result.output + assert "1 active" in result.output + assert "gate: evaluated" in result.output + + def test_scan_config_error_exits_2(tmp_path: Path) -> None: import shutil diff --git a/tests/unit/core/test_agent_summary.py b/tests/unit/core/test_agent_summary.py index bcb54076..9c6972a5 100644 --- a/tests/unit/core/test_agent_summary.py +++ b/tests/unit/core/test_agent_summary.py @@ -48,6 +48,22 @@ def test_agent_summary_active_defects_first_and_stable(tmp_path: Path) -> None: assert defect["next_tool_calls"][0]["tool"] == "explain_taint" +def test_agent_summary_gate_block_carries_reason_and_evaluated(tmp_path: Path) -> None: + # The dogfood #2 fix must reach the agent_summary gate block, not just the MCP scan + # top-level: a baselined-only scan that trips must SAY why and which population. 
+ (tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") + scan = run_scan(tmp_path) + fp = next(f.fingerprint for f in scan.findings if f.rule_id == "PY-WL-101") + bl = tmp_path / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + write_baseline(bl, [next(f for f in scan.findings if f.fingerprint == fp)]) + rescan = run_scan(tmp_path) + out = build_agent_summary(rescan, gate_decision(rescan, Severity.ERROR)).to_dict() + assert out["gate"]["tripped"] is True + assert "suppressed" in out["gate"]["reason"] + assert "unsuppressed" in out["gate"]["evaluated"] + + def test_agent_summary_no_active_defects_still_has_next_actions(tmp_path: Path) -> None: (tmp_path / "svc.py").write_text("def f():\n return 1\n", encoding="utf-8") scan = run_scan(tmp_path) diff --git a/tests/unit/core/test_cli_mcp_parity.py b/tests/unit/core/test_cli_mcp_parity.py index c3dc6995..f2ac999a 100644 --- a/tests/unit/core/test_cli_mcp_parity.py +++ b/tests/unit/core/test_cli_mcp_parity.py @@ -37,6 +37,8 @@ def test_cli_and_mcp_scan_agree_on_findings_and_gate() -> None: "tripped": cli_gate.tripped, "fail_on": cli_gate.fail_on, "exit_class": cli_gate.exit_class, + "reason": cli_gate.reason, + "evaluated": cli_gate.evaluated, } assert mcp["summary"]["total"] == cli_result.summary.total assert mcp["summary"]["active"] == cli_result.summary.active diff --git a/tests/unit/core/test_run.py b/tests/unit/core/test_run.py index 11f7a47d..b1fa6203 100644 --- a/tests/unit/core/test_run.py +++ b/tests/unit/core/test_run.py @@ -199,6 +199,43 @@ def test_trust_suppressions_restores_old_gate_clearing(tmp_path: Path, writer) - assert gate_decision(result, Severity.ERROR).tripped is False +def test_gate_decision_reason_names_suppressed_population_on_default_trip(tmp_path: Path) -> None: + # The dogfood #2 confusion: summary.active:0 + gate.tripped:true. 
The verdict must + # SAY why — name the suppressed-but-gated count and the escape hatches — and name the + # population it judged, so the agent does not have to run scan twice to infer it. + proj, fp = _leaky_proj(tmp_path) + _write_baseline(proj, fp) + decision = gate_decision(run_scan(proj), Severity.ERROR) + assert decision.tripped is True + assert decision.reason is not None + assert "1 suppressed" in decision.reason + assert "--trust-suppressions" in decision.reason and "--new-since" in decision.reason + assert decision.evaluated is not None and "unsuppressed" in decision.evaluated + + +def test_gate_decision_reason_names_active_defect_on_genuine_trip(tmp_path: Path) -> None: + proj, _ = _leaky_proj(tmp_path) # no suppression -> a genuinely active defect + decision = gate_decision(run_scan(proj), Severity.ERROR) + assert decision.tripped is True + assert decision.reason is not None and "1 active" in decision.reason + # a genuine active trip should NOT misdirect the agent to the suppression flags + assert "--trust-suppressions" not in decision.reason + + +def test_gate_decision_evaluated_reflects_trust_suppressions(tmp_path: Path) -> None: + proj, fp = _leaky_proj(tmp_path) + _write_baseline(proj, fp) + decision = gate_decision(run_scan(proj, trust_suppressions=True), Severity.ERROR) + assert decision.tripped is False + assert decision.evaluated is not None and "honored" in decision.evaluated + + +def test_gate_decision_no_threshold_has_no_reason() -> None: + result = ScanResult(findings=[], summary=ScanSummary(0, 0, 0, 0, 0), files_scanned=0, context=None) + decision = gate_decision(result, None) + assert decision.reason is None and decision.evaluated is None + + def test_gate_findings_is_unsuppressed_population(tmp_path: Path) -> None: proj, fp = _leaky_proj(tmp_path) _write_baseline(proj, fp) From b0e9dba0457ebbf18abf3a8ea3953a513b67ebba Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 12:53:15 +1000 Subject: [PATCH 03/17] feat(gate): loud 
baseline-migration signal + UPGRADING/CHANGELOG (wardline-5f662e7a4f) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dogfood friction #3: the secure gate-default (gate on the unsuppressed population) is correct, but the rollout was silent — a repo whose committed baseline used to clear --fail-on goes red with no code change, and an agent can't tell whether IT broke scan or HEAD was already red. New `baseline_migration_hint`: fires ONLY in the exact 'my repo went red with no code change' case — a committed .wardline/baseline.yaml exists, the gate trips SOLELY because baselined defects re-enter the unsuppressed population (no genuinely-active defect, no waiver/judged-only trip), and neither --trust-suppressions nor --new-since was passed. It points at both escape hatches and UPGRADING.md. Silent on a genuine active trip, a trusted/PR-scoped run, or no baseline file. Surfaced loudly on CLI stderr and as MCP `scan` gate.migration_hint (None otherwise). New UPGRADING.md documents the secure-default migration; CHANGELOG [Unreleased] gains entries for dogfood #1/#2/#3. Secure default unchanged. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 30 ++++++++++++++ UPGRADING.md | 46 +++++++++++++++++++++ src/wardline/cli/scan.py | 17 +++++--- src/wardline/core/run.py | 55 +++++++++++++++++++++++++ src/wardline/mcp/server.py | 4 +- tests/unit/cli/test_cli.py | 21 ++++++++++ tests/unit/core/test_cli_mcp_parity.py | 4 +- tests/unit/core/test_run.py | 57 +++++++++++++++++++++++++- 8 files changed, 225 insertions(+), 9 deletions(-) create mode 100644 UPGRADING.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 923d237e..83428657 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,37 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- **The `--fail-on` gate verdict now explains itself (dogfood friction #2/#3).** A scan + reporting `summary.active: 0` while `gate.tripped: true` no longer reads as a bug. The + gate block (CLI stderr, MCP `scan` result, and the agent-summary) carries a human + `reason` — e.g. `"34 suppressed ERROR+ defect(s) (baseline/waiver/judged) not cleared; + pass --trust-suppressions (trusted checkout) or --new-since (PR)"` for a + suppressed-only trip, `"N active ERROR+ defect(s)"` for a genuine one (no misdirection + to the suppression flags) — and an `evaluated` string naming the judged population + (`unsuppressed …` by default vs `post-suppression … honored` under + `--trust-suppressions`). Counts come from the annotated findings, so they match + `summary`. +- **Loud migration signal for the secure gate-default rollout (dogfood friction #3).** + When a committed `.wardline/baseline.yaml` exists, the gate trips **solely** because + baselined defects re-enter the unsuppressed population, and neither + `--trust-suppressions` nor `--new-since` was passed, Wardline now prints a one-line + `migration:` hint (CLI stderr; MCP `scan` `gate.migration_hint`) pointing at the + escape hatches and the new **`UPGRADING.md`**. 
This is the "my repo went red with no + code change" case made self-explaining; the secure default itself is unchanged. + ### Fixed +- **`scan --format legis --allow-dirty` emits an unsigned dev artifact instead of + refusing (dogfood friction #1).** On a dirty working tree `scan --format legis` + failed `exit 2` naming an `allow_dirty` flag that was never exposed — presenting + identically to "legis is broken," the session's single biggest rabbit hole. The flag + is now exposed (`--allow-dirty` CLI / `allow_dirty` MCP `scan`). The honest fix: a + dirty tree under `--allow-dirty` does **not** sign — the only readable `tree_sha` is + the *committed* one, which does not describe dirty working content, so signing it + would be false provenance. It falls through to the **unsigned** dev artifact, clearly + marked `dirty: true` (legis records it `unverified`). Signing stays clean-tree-only; + the loud refusal without `--allow-dirty` is unchanged. Lets the dev/tour loop exercise + the Wardline→legis handshake without a commit. - **Loomweave HMAC signer resync (auth path was 401ing every signed request).** Wardline's request signature drifted from Loomweave's verifier (ADR-042): the canonical message is now `METHOD\nPATH\nSHA256HEX(body)\nTIMESTAMP\nNONCE` (the diff --git a/UPGRADING.md b/UPGRADING.md new file mode 100644 index 00000000..0902549e --- /dev/null +++ b/UPGRADING.md @@ -0,0 +1,46 @@ +# Upgrading Wardline + +Migration notes for changes that can alter a previously-green run. Newest first. + +## To v1.0 — the `--fail-on` gate no longer honors committed suppressions by default + +**What changed.** `.wardline/baseline.yaml`, `wardline.yaml` waivers, and +`.wardline/judged.yaml` are all committed repository content, so a malicious pull +request could add a suppression entry keyed to its own new defect's fingerprint and +clear the gate. 
The `--fail-on` gate now evaluates the **unsuppressed** population by +default: baseline / waiver / judged still **annotate** the emitted findings +(`suppressed: baselined | waived | judged`) but no longer clear the gate. + +**Symptom on upgrade.** A repository whose committed baseline used to clear +`wardline scan --fail-on=ERROR` goes **red with no change to its own code**, because +the baselined defects re-enter the gate population. Wardline now says so out loud — a +clean run that trips solely on baselined findings (and was given neither +`--trust-suppressions` nor `--new-since`) prints: + +``` +migration: baseline present but not honored by default since v1.0 (secure gate default) — +N baselined ERROR+ defect(s) re-enter the gate. Pass --trust-suppressions for a trusted +local checkout or --new-since in CI. See UPGRADING.md. +``` + +The same signal rides the MCP `scan` result at `gate.migration_hint`, and the gate +block always carries a `reason` and the `evaluated` population so "0 active + gate +FAILED" never reads as a bug. + +**How to restore a passing gate.** Pick the one that matches your trust posture: + +- **CI (recommended): `--new-since <ref>`.** Scopes both the emitted findings + and the gate to what changed since the ref — an operator-supplied, unforgeable + ratchet a PR cannot tamper with. A baselined defect that is *not* in the diff stops + gating; a brand-new defect still trips. +- **Trusted local checkout: `--trust-suppressions`** (CLI) / `trust_suppressions: true` + (MCP `scan`). Restores the old post-suppression gate. Use **only** where the + suppression files are trusted — never to enforce on untrusted PR content. This is + what the `judge` workflow uses internally. + +Keeping the baseline up to date (`wardline baseline update`) and clearing real debt is +the durable fix; the flags above are the migration bridge. 
+ +**Not affected.** legis's scan artifact and the "one judge / reproduces Wardline's gate +population exactly" property are derived from the gate population, so they already +reflect the secure view. Only the local `--fail-on` exit code changed. diff --git a/src/wardline/cli/scan.py b/src/wardline/cli/scan.py index 37134418..99e345e2 100644 --- a/src/wardline/cli/scan.py +++ b/src/wardline/cli/scan.py @@ -13,7 +13,7 @@ from wardline.core.errors import WardlineError from wardline.core.filigree_emit import EmitResult, FiligreeEmitter from wardline.core.finding import Severity -from wardline.core.run import gate_decision, run_scan +from wardline.core.run import baseline_migration_hint, gate_decision, run_scan from wardline.core.sarif import SarifSink @@ -344,12 +344,17 @@ def confirm_cb(rel_path: str, orig: str, replacement: str, f: Finding) -> bool: f"(see WLN-ENGINE-* facts in {output}).", err=True, ) - decision = gate_decision(result, Severity(fail_on)) if fail_on is not None else None - gate_tripped = decision is not None and decision.tripped - if decision is not None and decision.tripped: + gate_dec = gate_decision(result, Severity(fail_on)) if fail_on is not None else None + gate_tripped = gate_dec is not None and gate_dec.tripped + if gate_dec is not None and gate_dec.tripped: # Never let "0 active + gate FAILED" read as a bug: say why and which population. - click.echo(f"gate: FAILED (--fail-on {decision.fail_on}) — {decision.reason}", err=True) - click.echo(f"gate: evaluated {decision.evaluated}", err=True) + click.echo(f"gate: FAILED (--fail-on {gate_dec.fail_on}) — {gate_dec.reason}", err=True) + click.echo(f"gate: evaluated {gate_dec.evaluated}", err=True) + # The secure-gate-default rollout signal: a committed baseline that used to clear + # the gate now re-enters it. Loud + separable from the generic reason above. 
+ hint = baseline_migration_hint(result, gate_dec, root=path, new_since=new_since) + if hint is not None: + click.echo(hint, err=True) # Independent of the severity gate: opt-in enforcement of "everything analysed". if gate_tripped or (fail_on_unanalyzed and s.unanalyzed): raise SystemExit(1) diff --git a/src/wardline/core/run.py b/src/wardline/core/run.py index 2978f06c..edf82751 100644 --- a/src/wardline/core/run.py +++ b/src/wardline/core/run.py @@ -24,6 +24,7 @@ Finding, Kind, Location, + Maturity, Severity, SuppressionState, ) @@ -307,6 +308,60 @@ def gate_decision(result: ScanResult, fail_on: Severity | None) -> GateDecision: ) +def baseline_migration_hint( + result: ScanResult, + decision: GateDecision, + *, + root: Path, + new_since: str | None, +) -> str | None: + """A LOUD one-line migration signal for the secure gate-default rollout, or None. + + Returns the hint ONLY in the exact 'my repo went red with no code change' case: + a committed ``.wardline/baseline.yaml`` exists, the gate tripped, the trip is + driven SOLELY by baselined defects re-entering the unsuppressed population (no + genuinely-active defect), and the operator passed neither ``--trust-suppressions`` + nor ``--new-since``. Otherwise None — a genuine active trip, a waiver/judged-only + trip, a trusted/PR-scoped run, or no baseline file are all NOT the rollout surprise. + """ + if not decision.tripped or decision.fail_on is None or new_since is not None: + return None + # --trust-suppressions honors the baseline, so there is no surprise to migrate from. 
+ if result.gate_findings is None: + return None + if not (root / ".wardline" / "baseline.yaml").is_file(): + return None + from wardline.core.suppression import gate_breakdown + + fail_on = Severity(decision.fail_on) + active, _suppressed = gate_breakdown(result.findings, fail_on) + if active: + return None # a real active defect tripped it — not a migration artifact + baselined = sum( + 1 + for f in result.findings + if f.kind is Kind.DEFECT + and f.suppressed is SuppressionState.BASELINED + and f.maturity is not Maturity.PREVIEW + and _gates(f.severity, fail_on) + ) + if not baselined: + return None # tripped by waived/judged only — different escape, not this hint + sev = decision.fail_on + return ( + f"migration: baseline present but not honored by default since v1.0 (secure gate default) — " + f"{baselined} baselined {sev}+ defect(s) re-enter the gate. Pass --trust-suppressions for a " + f"trusted local checkout or --new-since in CI. See UPGRADING.md." + ) + + +def _gates(severity: Severity, fail_on: Severity) -> bool: + from wardline.core.suppression import _RANK + + rank = _RANK.get(severity) + return rank is not None and rank >= _RANK[fail_on] + + def _gate_reason(result: ScanResult, fail_on: Severity, *, tripped: bool, honors_suppressions: bool) -> str: """The human verdict string. 
Counts the ANNOTATED population (``result.findings``) so the numbers match what the agent reads in ``summary``.""" diff --git a/src/wardline/mcp/server.py b/src/wardline/mcp/server.py index b962345b..e051c0fc 100644 --- a/src/wardline/mcp/server.py +++ b/src/wardline/mcp/server.py @@ -26,7 +26,7 @@ from wardline.core.finding import Finding, Kind, Severity, SuppressionState from wardline.core.finding_query import filter_findings from wardline.core.judge_run import run_judge -from wardline.core.run import gate_decision, run_scan +from wardline.core.run import baseline_migration_hint, gate_decision, run_scan from wardline.core.safe_paths import safe_project_file from wardline.core.sei_resolution import resolve_query_filters from wardline.core.waivers import add_waiver, parse_waivers @@ -223,6 +223,7 @@ def _scan( "disabled_reason": wr.disabled_reason, } decision = gate_decision(result, threshold) + migration_hint = baseline_migration_hint(result, decision, root=path, new_since=new_since) filigree_block = _emit_filigree(result.findings, filigree, scanned_paths=result.scanned_paths) filigree_status = _filigree_emit_status(filigree_block) loomweave_status = _loomweave_write_status(loomweave_block) @@ -269,6 +270,7 @@ def _scan( "exit_class": decision.exit_class, "reason": decision.reason, "evaluated": decision.evaluated, + "migration_hint": migration_hint, }, "loomweave": loomweave_block, "filigree": filigree_block, diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index 0c832339..04ceb35f 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -164,6 +164,27 @@ def test_scan_gate_trip_prints_reason_and_population(tmp_path: Path) -> None: assert "gate: evaluated" in result.output +def test_scan_baselined_only_trip_prints_migration_hint(tmp_path: Path) -> None: + # Dogfood #3: a committed baseline that used to clear the gate now re-enters it. + # The CLI must emit the loud one-line migration signal, not just exit 1. 
+ from wardline.core.baseline import write_baseline + from wardline.core.run import run_scan as _run_scan + + project = tmp_path / "proj" + project.mkdir() + (project / "svc.py").write_text(_LEAKY_SRC, encoding="utf-8") + scan = _run_scan(project) + leak = next(f for f in scan.findings if f.rule_id == "PY-WL-101") + bl = project / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + write_baseline(bl, [leak]) + out = tmp_path / "o.jsonl" + result = CliRunner().invoke(cli, ["scan", str(project), "--fail-on", "ERROR", "--output", str(out)]) + assert result.exit_code == 1 + assert "migration: baseline present but not honored by default since v1.0" in result.output + assert "UPGRADING.md" in result.output + + def test_scan_config_error_exits_2(tmp_path: Path) -> None: import shutil diff --git a/tests/unit/core/test_cli_mcp_parity.py b/tests/unit/core/test_cli_mcp_parity.py index f2ac999a..4ec93e83 100644 --- a/tests/unit/core/test_cli_mcp_parity.py +++ b/tests/unit/core/test_cli_mcp_parity.py @@ -17,7 +17,7 @@ from pathlib import Path from wardline.core.finding import Severity -from wardline.core.run import gate_decision, run_scan +from wardline.core.run import baseline_migration_hint, gate_decision, run_scan from wardline.mcp.server import _finding_to_dict, _scan _CORPUS = Path(__file__).resolve().parents[3] / "tests" / "corpus" / "fixtures" @@ -33,12 +33,14 @@ def test_cli_and_mcp_scan_agree_on_findings_and_gate() -> None: mcp = _scan({"fail_on": "ERROR"}, root=_CORPUS) assert mcp["findings"] == cli_findings + cli_hint = baseline_migration_hint(cli_result, cli_gate, root=_CORPUS, new_since=None) assert mcp["gate"] == { "tripped": cli_gate.tripped, "fail_on": cli_gate.fail_on, "exit_class": cli_gate.exit_class, "reason": cli_gate.reason, "evaluated": cli_gate.evaluated, + "migration_hint": cli_hint, } assert mcp["summary"]["total"] == cli_result.summary.total assert mcp["summary"]["active"] == cli_result.summary.active diff --git 
a/tests/unit/core/test_run.py b/tests/unit/core/test_run.py index b1fa6203..0840733d 100644 --- a/tests/unit/core/test_run.py +++ b/tests/unit/core/test_run.py @@ -8,7 +8,13 @@ from wardline.core.errors import ConfigError from wardline.core.finding import Finding, Kind, Location, Severity, SuppressionState from wardline.core.judged import JudgedFP, write_judged -from wardline.core.run import ScanResult, ScanSummary, gate_decision, run_scan +from wardline.core.run import ( + ScanResult, + ScanSummary, + baseline_migration_hint, + gate_decision, + run_scan, +) FIXTURE = Path("tests/fixtures/sample_project") @@ -236,6 +242,55 @@ def test_gate_decision_no_threshold_has_no_reason() -> None: assert decision.reason is None and decision.evaluated is None +def _hint(proj: Path, *, new_since=None, trust=False): + result = run_scan(proj, new_since=new_since, trust_suppressions=trust) + decision = gate_decision(result, Severity.ERROR) + return baseline_migration_hint(result, decision, root=proj, new_since=new_since) + + +def test_migration_hint_fires_on_baselined_only_trip(tmp_path: Path) -> None: + # The dogfood #3 'my repo went red with no code change' case: a committed baseline + # that used to clear the gate now re-enters it. Emit a loud one-liner pointing at + # the escape hatches and the upgrade note. + proj, fp = _leaky_proj(tmp_path) + _write_baseline(proj, fp) + hint = _hint(proj) + assert hint is not None + assert "baseline" in hint + assert "--trust-suppressions" in hint and "--new-since" in hint + assert "UPGRADING" in hint + + +def test_migration_hint_silent_under_trust_suppressions(tmp_path: Path) -> None: + proj, fp = _leaky_proj(tmp_path) + _write_baseline(proj, fp) + assert _hint(proj, trust=True) is None + + +def test_migration_hint_silent_under_new_since(tmp_path: Path) -> None: + # new_since scopes the gate (operator-supplied ratchet); the surprise — and the hint — + # belongs to the unscoped run. 
Assert the helper short-circuits on a non-None ref + # (tested directly so it does not require a git repo for the delta walk). + proj, fp = _leaky_proj(tmp_path) + _write_baseline(proj, fp) + result = run_scan(proj) + decision = gate_decision(result, Severity.ERROR) + assert baseline_migration_hint(result, decision, root=proj, new_since="origin/main") is None + + +def test_migration_hint_silent_on_genuine_active_trip(tmp_path: Path) -> None: + # An active (un-baselined) defect trips for a real reason — not a migration surprise. + proj, _ = _leaky_proj(tmp_path) + assert _hint(proj) is None + + +def test_migration_hint_silent_without_baseline_file(tmp_path: Path) -> None: + # A waiver-only trip is real debt, not the baseline-rollout surprise this hint is for. + proj, fp = _leaky_proj(tmp_path) + _write_waiver(proj, fp) + assert _hint(proj) is None + + def test_gate_findings_is_unsuppressed_population(tmp_path: Path) -> None: proj, fp = _leaky_proj(tmp_path) _write_baseline(proj, fp) From c1f5da211e8ab01d78e6136f3db79f177f89b3e7 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 13:00:23 +1000 Subject: [PATCH 04/17] =?UTF-8?q?feat(mcp):=20scan=20payload=20controls=20?= =?UTF-8?q?=E2=80=94=20where=20shrinks=20payload,=20summary=5Fonly/max=5Ff?= =?UTF-8?q?indings/include=5Fsuppressed,=20default=20explain=20cap=20(ward?= =?UTF-8?q?line-2957009961)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dogfood friction #5: the documented cost lever (`where`) did not control cost and one-shot `explain:true` was unusable on a real repo. - `where` now filters the agent_summary arrays too (it only filtered the top-level findings list before) — a filter matching 0 findings no longer returns dozens of suppressed findings inline. agent_summary build takes a display_findings view; its summary COUNTS stay whole-project. - New `summary_only:true` (counts + gate, no bodies — smallest "did the gate pass?" 
payload), `include_suppressed:false` (drop suppressed bodies; counts stay), `max_findings:N` (cap returned bodies). - DEFAULT explain ceiling: `explain:true` inlined provenance for EVERY active defect (56,820 chars on one line over a whole repo). Capped at 25 by default; max_findings tightens it. Findings past the cap are still returned, sans inline explanation. - New `truncation` block (findings_total/findings_returned/findings_truncated/ explanations_truncated/summary_only/include_suppressed/max_findings) so a bounded payload is never mistaken for "covered everything." CLI --format agent-summary is byte-unchanged (defaults preserve whole-project, uncapped behaviour). Docs (agents.md, legis-handoff.md --allow-dirty) + CHANGELOG updated. Full suite 2476 green; ruff/mypy/mkdocs-strict clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 15 ++++ docs/guides/agents.md | 18 +++++ docs/guides/legis-handoff.md | 13 ++++ src/wardline/core/agent_summary.py | 55 +++++++++++--- src/wardline/mcp/server.py | 82 +++++++++++++++++++- tests/unit/mcp/test_server_query_explain.py | 83 +++++++++++++++++++++ 6 files changed, 253 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83428657..0c4714d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **MCP `scan` payload controls — `where` now shrinks the payload, plus + `summary_only` / `max_findings` / `include_suppressed` and a default explain cap + (dogfood friction #5).** `where` previously filtered only the top-level `findings` + list; the `agent_summary` arrays still inlined every suppressed finding, so a filter + matching zero findings still returned dozens. `where` now filters the `agent_summary` + arrays too. New args: `summary_only: true` (counts + gate, no finding bodies — the + smallest "did the gate pass?" 
payload), `include_suppressed: false` (drop suppressed + bodies; counts stay in `summary`), and `max_findings: N` (cap the returned bodies). + `explain: true` no longer inlines provenance for *every* active defect — the one-shot + blowup that returned 56,820 chars on one line — it is capped at 25 by default + (tighten with `max_findings`). Every cut is reported in a new `truncation` block + (`findings_total` / `findings_returned` / `findings_truncated` / + `explanations_truncated`) so a bounded payload never reads as "covered everything." + `summary`/`gate` always describe the whole project; the CLI `--format agent-summary` + output is unchanged. - **The `--fail-on` gate verdict now explains itself (dogfood friction #2/#3).** A scan reporting `summary.active: 0` while `gate.tripped: true` no longer reads as a bug. The gate block (CLI stderr, MCP `scan` result, and the agent-summary) carries a human diff --git a/docs/guides/agents.md b/docs/guides/agents.md index 8e26c209..64d4d5c9 100644 --- a/docs/guides/agents.md +++ b/docs/guides/agents.md @@ -224,6 +224,24 @@ Resources expose the trust vocabulary, rule catalog, config, and config schema. The `wardline:loop` prompt documents the intended scan → explain → fix-at-the-boundary → rescan cycle. +`scan` payload controls (the `summary`/`gate` blocks always describe the whole +project — these only bound the returned finding bodies): + +- `where` — a conjunctive read-lens (keys: `rule_id`, `qualname`, `severity`, + `suppression`, `kind`, `path_glob`, `sink`, `tier`) that filters **both** the + `findings` list and the `agent_summary` arrays. +- `summary_only: true` — counts + gate only, no finding bodies. The smallest + "did the gate pass?" payload. +- `include_suppressed: false` — drop suppressed (baselined/waived/judged) bodies; + the suppression counts stay in `summary`. +- `max_findings: N` — cap the returned bodies (and inlined explanations). 
+- `explain: true` — inline each active defect's provenance; capped at 25 by + default (tighten with `max_findings`). + +Every cut is reported in the response `truncation` block (`findings_total`, +`findings_returned`, `findings_truncated`, `explanations_truncated`) so a bounded +payload never reads as "covered everything." + With an opt-in Loomweave taint store configured (`wardline mcp --loomweave-url <url>`), `explain_taint` becomes a query when you pass the finding's `qualname` as `sink_qualname`: a fresh fact is served from the store without re-scanning diff --git a/docs/guides/legis-handoff.md b/docs/guides/legis-handoff.md index 1ee85bff..72820e22 100644 --- a/docs/guides/legis-handoff.md +++ b/docs/guides/legis-handoff.md @@ -61,6 +61,19 @@ as `unverified` — the trust-the-agent posture before a key is set). `tree_sha` that does not match the scanned content is false provenance, so it is refused rather than emitted. +!!! tip "Dev/tour loop on a dirty tree: `--allow-dirty`" + Signing is clean-tree-only, but you do not need a commit to exercise the + Wardline→legis handshake. Pass `--allow-dirty` (CLI) / `allow_dirty: true` (MCP + `scan`) to emit an **unsigned**, clearly-marked artifact on a dirty tree: + + ```bash + wardline scan . --format legis --allow-dirty --output /tmp/scan.legis.json + ``` + + The artifact carries `"dirty": true` and **no** `artifact_signature`; legis records + it as `unverified`. The committed tree is never signed as if it described dirty + working content. Use it for the dev loop and the tour — never to gate CI. 
+ ### From the MCP server (agents) The `scan` tool attaches the artifact automatically once the secret is provisioned — diff --git a/src/wardline/core/agent_summary.py b/src/wardline/core/agent_summary.py index 6405ebeb..b90c2407 100644 --- a/src/wardline/core/agent_summary.py +++ b/src/wardline/core/agent_summary.py @@ -46,19 +46,46 @@ class AgentSummary: gate: GateDecision filigree_emit: dict[str, Any] = field(default_factory=_default_filigree_status) loomweave_write: dict[str, Any] = field(default_factory=_default_loomweave_status) + # Payload-shrinking controls (dogfood #4). The summary COUNTS always describe the + # whole project; these govern only the inline finding ARRAYS. ``display_findings`` + # is the (already ``where``-filtered) view the arrays are built from — None means the + # whole result, the back-compat default used by the CLI ``--format agent-summary``. + display_findings: list[Finding] | None = None + summary_only: bool = False + max_findings: int | None = None + include_suppressed: bool = True def to_dict(self) -> dict[str, Any]: - active_defects = [_finding_entry(f, include_next=True) for f in _active_defects(self.result.findings)] - suppressed = [_finding_entry(f, include_next=False) for f in _suppressed_defects(self.result.findings)] - engine_facts = [_finding_entry(f, include_next=False) for f in _engine_facts(self.result.findings)] + # Counts are whole-project (summary describes the whole project, per the `where` + # contract); arrays come from the displayed/filtered view, then bounded. 
+ count_active = len(_active_defects(self.result.findings)) + count_suppressed = len(_suppressed_defects(self.result.findings)) + count_facts = len(_engine_facts(self.result.findings)) + + base = self.result.findings if self.display_findings is None else self.display_findings + if self.summary_only: + shown_active: list[Finding] = [] + shown_suppressed: list[Finding] = [] + shown_facts: list[Finding] = [] + else: + shown_active = _active_defects(base) + shown_suppressed = _suppressed_defects(base) if self.include_suppressed else [] + shown_facts = _engine_facts(base) + if self.max_findings is not None: + shown_active = shown_active[: self.max_findings] + shown_suppressed = shown_suppressed[: self.max_findings] + shown_facts = shown_facts[: self.max_findings] + active_defects = [_finding_entry(f, include_next=True) for f in shown_active] + suppressed = [_finding_entry(f, include_next=False) for f in shown_suppressed] + engine_facts = [_finding_entry(f, include_next=False) for f in shown_facts] return { "schema": SCHEMA, "summary": { "files_scanned": self.result.files_scanned, "total_findings": self.result.summary.total, - "active_defects": len(active_defects), - "suppressed_findings": len(suppressed), - "engine_facts": len(engine_facts), + "active_defects": count_active, + "suppressed_findings": count_suppressed, + "engine_facts": count_facts, "baselined": self.result.summary.baselined, "waived": self.result.summary.waived, "judged": self.result.summary.judged, @@ -78,7 +105,9 @@ def to_dict(self) -> dict[str, Any]: "active_defects": active_defects, "suppressed_findings": suppressed, "engine_facts": engine_facts, - "next_actions": _next_actions(active_defects), + # next_actions follow the whole-project active count, not the displayed slice, + # so a summary_only/filtered view does not falsely say "no active defects". 
+ "next_actions": _next_actions_for(count_active), } @@ -148,8 +177,8 @@ def _finding_entry(finding: Finding, *, include_next: bool) -> dict[str, Any]: return entry -def _next_actions(active_defects: list[dict[str, Any]]) -> list[dict[str, Any]]: - if not active_defects: +def _next_actions_for(active_count: int) -> list[dict[str, Any]]: + if active_count == 0: return [{"tool": "scan", "reason": "no active defects; rescan after edits"}] return [ {"tool": "explain_taint", "reason": "inspect each active defect before editing"}, @@ -164,10 +193,18 @@ def build_agent_summary( *, filigree_emit: dict[str, Any] | None = None, loomweave_write: dict[str, Any] | None = None, + display_findings: list[Finding] | None = None, + summary_only: bool = False, + max_findings: int | None = None, + include_suppressed: bool = True, ) -> AgentSummary: return AgentSummary( result=result, gate=gate, filigree_emit=filigree_emit or _default_filigree_status(), loomweave_write=loomweave_write or _default_loomweave_status(), + display_findings=display_findings, + summary_only=summary_only, + max_findings=max_findings, + include_suppressed=include_suppressed, ) diff --git a/src/wardline/mcp/server.py b/src/wardline/mcp/server.py index e051c0fc..f0188a2b 100644 --- a/src/wardline/mcp/server.py +++ b/src/wardline/mcp/server.py @@ -234,9 +234,41 @@ def _scan( except (ValueError, WardlineError) as exc: # An unknown filter key or SEI resolution failure is agent-actionable -> isError result. raise ToolError(str(exc)) from exc + + # Payload-shrinking controls (dogfood #4). The `summary`/`gate` blocks always + # describe the WHOLE project; these only bound the returned finding bodies. 
+ summary_only = bool(args.get("summary_only") or False) + raw_include = args.get("include_suppressed") + include_suppressed = True if raw_include is None else bool(raw_include) + max_findings = args.get("max_findings") + if max_findings is not None and ( + not isinstance(max_findings, int) or isinstance(max_findings, bool) or max_findings < 0 + ): + raise ToolError("max_findings must be a non-negative integer") explain = bool(args.get("explain")) + + # include_suppressed:false drops the suppressed DEFECT bodies (counts stay whole). + if not include_suppressed: + selected = [f for f in selected if not (f.kind is Kind.DEFECT and f.suppressed is not SuppressionState.ACTIVE)] + findings_total = len(selected) + + # summary_only returns no finding bodies at all (the smallest "did the gate pass?" + # payload); otherwise an explicit max_findings bounds the list (default: uncapped). + display = [] if summary_only else selected + findings_truncated = False + if max_findings is not None and len(display) > max_findings: + display = display[:max_findings] + findings_truncated = True + + # explain has a DEFAULT ceiling: inlining EVERY active defect's provenance is the + # 56KB-on-one-line blowup the dogfood report hit. Cap the number of explanations (an + # explicit max_findings tightens it further); findings past the cap are still + # returned, just without inline provenance. The cut is announced, never silent. 
+ explain_cap = max_findings if max_findings is not None else _EXPLAIN_DEFAULT_CAP + explanations_attached = 0 + explanations_truncated = False findings_out: list[dict[str, Any]] = [] - for f in selected: + for f in display: d = _finding_to_dict(f) if ( explain @@ -245,8 +277,12 @@ def _scan( and f.qualname is not None and result.context is not None ): - exp = explanation_from_context(f, result.context) - d["explanation"] = _explanation_to_dict(exp) + if explanations_attached < explain_cap: + exp = explanation_from_context(f, result.context) + d["explanation"] = _explanation_to_dict(exp) + explanations_attached += 1 + else: + explanations_truncated = True findings_out.append(d) from wardline.core.agent_summary import build_agent_summary @@ -264,6 +300,16 @@ def _scan( # silent under-scan reaches the agent, not just the human-facing stderr. "unanalyzed": result.summary.unanalyzed, }, + # Make every cut explicit so a bounded payload never reads as "covered all". + "truncation": { + "summary_only": summary_only, + "include_suppressed": include_suppressed, + "max_findings": max_findings, + "findings_total": findings_total, + "findings_returned": len(findings_out), + "findings_truncated": findings_truncated, + "explanations_truncated": explanations_truncated, + }, "gate": { "tripped": decision.tripped, "fail_on": decision.fail_on, @@ -281,6 +327,10 @@ def _scan( decision, filigree_emit=filigree_status, loomweave_write=loomweave_status, + display_findings=selected, + summary_only=summary_only, + max_findings=max_findings, + include_suppressed=include_suppressed, ).to_dict(), } _attach_legis_artifact( @@ -643,6 +693,11 @@ def _fix(args: dict[str, Any], root: Path) -> dict[str, Any]: # fail_on=NONE is not a meaningful gate threshold. _SEVERITY_ENUM = ["CRITICAL", "ERROR", "WARN", "INFO"] +# Default ceiling on the number of active-defect provenances inlined by `explain: true` +# on the MCP `scan`. 
Bounds the one-shot payload (the dogfood report hit 56,820 chars on +# one line over a whole repo); an explicit `max_findings` tightens it further. +_EXPLAIN_DEFAULT_CAP = 25 + class WardlineMCPServer: def __init__( @@ -782,7 +837,26 @@ def _register_tools(self) -> None: "type": "boolean", "description": "Inline each active defect's taint provenance " "(immediate tainted callee, source boundary, trust tiers, resolution " - "counts) — one call instead of an explain_taint per finding.", + "counts) — one call instead of an explain_taint per finding. Inlining is " + "capped at 25 provenances by default (tighten with max_findings); the cut " + "is reported at truncation.explanations_truncated.", + }, + "summary_only": { + "type": "boolean", + "description": "Return counts + gate only, no finding bodies — the smallest " + "'did the gate pass?' payload. summary/gate still describe the whole project.", + }, + "max_findings": { + "type": "integer", + "description": "Cap the number of returned finding bodies (and inlined " + "explanations). The cut is reported in the truncation block; summary counts " + "stay whole-project.", + }, + "include_suppressed": { + "type": "boolean", + "description": "Default true. 
Set false to drop suppressed (baselined/waived/" + "judged) finding bodies from the response; the suppression counts stay in " + "summary.", }, "new_since": { "type": "string", diff --git a/tests/unit/mcp/test_server_query_explain.py b/tests/unit/mcp/test_server_query_explain.py index a7de0ce0..fb39f1e7 100644 --- a/tests/unit/mcp/test_server_query_explain.py +++ b/tests/unit/mcp/test_server_query_explain.py @@ -4,6 +4,24 @@ from wardline.mcp.server import ToolError, _scan + +def _many_leaks(n: int) -> str: + head = "from wardline.decorators import external_boundary, trusted\n@external_boundary\ndef raw(p):\n return p\n" + body = "".join(f"@trusted\ndef leak_{i}(p):\n return raw(p)\n" for i in range(n)) + return head + body + + +def _baseline_all(tmp_path) -> None: + # Baseline every PY-WL-101 finding so they all become suppressed=baselined. + from wardline.core.baseline import write_baseline + from wardline.core.run import run_scan + + scan = run_scan(tmp_path) + defects = [f for f in scan.findings if f.rule_id == "PY-WL-101"] + bl = tmp_path / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + write_baseline(bl, defects) + # Two boundaries + two trusted leaks → PY-WL-101 fires on both leaks. _SRC = ( "from wardline.decorators import external_boundary, trusted\n" @@ -66,3 +84,68 @@ def test_explain_matches_single_finding_explain(tmp_path): single = _explain_taint({"fingerprint": f["fingerprint"]}, tmp_path) # All six explanation keys must match the single-finding explain projection. assert f["explanation"] == {k: single[k] for k in f["explanation"]} + + +# --- dogfood #4: payload shrinking ------------------------------------------ + + +def test_where_filters_agent_summary_arrays(tmp_path): + # Symptom (a): where matching 0 findings still returned all 34 suppressed inline. + # The agent_summary finding arrays must respect `where`; summary counts stay whole. 
+ (tmp_path / "svc.py").write_text(_many_leaks(5), encoding="utf-8") + _baseline_all(tmp_path) + out = _scan({"where": {"suppression": "active", "severity": "CRITICAL"}}, tmp_path) + assert out["findings"] == [] # 0 active CRITICAL + summ = out["agent_summary"] + assert summ["suppressed_findings"] == [] + assert summ["active_defects"] == [] + # but the whole-project counts are preserved + assert summ["summary"]["suppressed_findings"] == 5 + assert out["summary"]["baselined"] == 5 + + +def test_explain_true_has_default_cap(tmp_path): + # Blocker (c): bare explain:true over a many-defect repo must NOT inline every + # provenance (the 56KB-on-one-line symptom). A DEFAULT ceiling bounds it, and the + # truncation is announced — never silent. + (tmp_path / "svc.py").write_text(_many_leaks(40), encoding="utf-8") + out = _scan({"explain": True}, tmp_path) + explained = [f for f in out["findings"] if "explanation" in f] + assert 0 < len(explained) <= 25 # default cap + assert out["truncation"]["explanations_truncated"] is True + # the true total is still reported, so nothing is silently hidden + assert out["summary"]["active"] == 40 + + +def test_summary_only_omits_finding_arrays(tmp_path): + # (d): the "did the gate pass?" payload — counts + gate, no finding bodies. + (tmp_path / "svc.py").write_text(_many_leaks(5), encoding="utf-8") + out = _scan({"summary_only": True, "fail_on": "ERROR"}, tmp_path) + assert out["findings"] == [] + summ = out["agent_summary"] + assert summ["active_defects"] == [] and summ["suppressed_findings"] == [] and summ["engine_facts"] == [] + # counts + gate intact + assert out["summary"]["active"] == 5 + assert out["gate"]["tripped"] is True + assert out["truncation"]["summary_only"] is True + + +def test_include_suppressed_false_drops_suppressed(tmp_path): + # (b): drop the suppressed bodies from both surfaces; keep the counts. 
+ (tmp_path / "svc.py").write_text(_many_leaks(5), encoding="utf-8") + _baseline_all(tmp_path) + out = _scan({"include_suppressed": False}, tmp_path) + assert all(f["suppressed"] == "active" for f in out["findings"]) + assert out["agent_summary"]["suppressed_findings"] == [] + # whole-project count still visible + assert out["summary"]["baselined"] == 5 + + +def test_max_findings_caps_and_marks(tmp_path): + # (b): bound the returned list and announce the cut. + (tmp_path / "svc.py").write_text(_many_leaks(10), encoding="utf-8") + out = _scan({"max_findings": 3}, tmp_path) + assert len(out["findings"]) == 3 + assert out["truncation"]["findings_truncated"] is True + assert out["truncation"]["findings_returned"] == 3 + assert out["truncation"]["findings_total"] >= 10 From e79ef143c426e57cca09c231589e13d48ec79901 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 13:06:25 +1000 Subject: [PATCH 05/17] fix(gate): count the verdict reason over the actual gate population (wardline-be75c6676d follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gate reason counted `gate_breakdown(result.findings)` — the annotated population — so under `--new-since` a delta-scoped-out defect (converted to BASELINED by apply_delta_scope) was wrongly counted as "suppressed >= threshold", inflating the count and pointing at `--new-since` (already supplied). _gate_reason now classifies the defects that ACTUALLY gate (the unsuppressed gate population, where out-of-delta defects are BASELINED and so excluded) by their state in the emitted findings. The count is exactly what tripped the gate; the `--new-since` path no longer over-counts. The trust-suppressions branch is unchanged (gate == emitted findings there). Locked by extending the new_since differential to assert 1, not 2. 
Verified: legis `ScanResultsIn.scan` is typed `dict` (arbitrary mapping), so the new unsigned `dirty:true` marker rides through intake untouched — confirmed the dev artifact stays postable. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/wardline/core/run.py | 29 ++++++++++++++++++++++++----- tests/unit/core/test_run.py | 8 +++++++- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/wardline/core/run.py b/src/wardline/core/run.py index edf82751..dbd12c46 100644 --- a/src/wardline/core/run.py +++ b/src/wardline/core/run.py @@ -363,18 +363,37 @@ def _gates(severity: Severity, fail_on: Severity) -> bool: def _gate_reason(result: ScanResult, fail_on: Severity, *, tripped: bool, honors_suppressions: bool) -> str: - """The human verdict string. Counts the ANNOTATED population (``result.findings``) - so the numbers match what the agent reads in ``summary``.""" + """The human verdict string, counted over the ACTUAL gate population so the numbers + are exactly what tripped it.""" from wardline.core.suppression import gate_breakdown sev = fail_on.value - active, suppressed = gate_breakdown(result.findings, fail_on) if not tripped: return f"no {sev}+ defects in the evaluated population" - # Under --trust-suppressions the suppressed defects are honored (cleared), so only - # active ones can have tripped the gate; never misdirect to the suppression flags. + # Under --trust-suppressions the gate IS the annotated findings (suppressions + # honored), so only genuinely-active defects can have tripped it; never misdirect to + # the suppression flags. if honors_suppressions: + active, _ = gate_breakdown(result.findings, fail_on) return f"{active} active {sev}+ defect(s) at or above {sev}" + # Secure default: classify the defects that ACTUALLY gate (the unsuppressed gate + # population) by their state in the emitted findings. 
A ``--new-since`` delta scopes + # out-of-delta defects to BASELINED in the gate population too, so they are not ACTIVE + # here and are correctly NOT counted — the reason never inflates with scoped-out + # findings nor points at a flag that was already supplied. + gate_pop = result.gate_findings or [] + emitted_state = {f.fingerprint: f.suppressed for f in result.findings} + active = 0 + suppressed = 0 + for f in gate_pop: + if f.kind is not Kind.DEFECT or f.maturity is Maturity.PREVIEW: + continue + if f.suppressed is not SuppressionState.ACTIVE or not _gates(f.severity, fail_on): + continue + if emitted_state.get(f.fingerprint, SuppressionState.ACTIVE) is SuppressionState.ACTIVE: + active += 1 + else: + suppressed += 1 escape = "pass --trust-suppressions (trusted checkout) or --new-since (PR)" if active and suppressed: return f"{active} active + {suppressed} suppressed {sev}+ defect(s) gate by default; {escape}" diff --git a/tests/unit/core/test_run.py b/tests/unit/core/test_run.py index 0840733d..f807bc8d 100644 --- a/tests/unit/core/test_run.py +++ b/tests/unit/core/test_run.py @@ -419,7 +419,13 @@ def run_dispatch(args, **kwargs): # The out-of-delta unrelated.h is scoped OUT of the gate (delta: unchanged). assert gate_by_qn["unrelated.h"].suppressed is SuppressionState.BASELINED # Net: the gate trips on the new defect, and the repo baseline did not clear it. - assert gate_decision(result, Severity.ERROR).tripped is True + decision = gate_decision(result, Severity.ERROR) + assert decision.tripped is True + # The verdict reason counts only what ACTUALLY gates: caller.f (in-delta, repo-baselined + # -> 1 suppressed). unrelated.h is delta-scoped-out (BASELINED in the gate population), + # so it must NOT inflate the count — exactly 1, not 2. 
+ assert decision.reason is not None + assert "1 suppressed" in decision.reason and "2 suppressed" not in decision.reason def test_run_scan_counts_unanalyzed_parse_error(tmp_path: Path) -> None: From 34a012e168cfb49cd7c019f7ec50896d8db6f7c1 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 13:08:11 +1000 Subject: [PATCH 06/17] fix(mcp): tighten default explain cap to 10 (wardline-2957009961 follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reported one-shot blowup was 56,820 chars over 34 findings and exceeded the tool token limit; a default of 25 inlined provenances was still uncomfortably close. Lower the default ceiling to 10 — comfortably under the limit, still plenty to triage in one call — and let max_findings RAISE it when the agent explicitly accepts the larger payload (summary_only covers the common "did the gate pass?" case). New test locks that max_findings can lift the count above the default. Docs/CHANGELOG updated. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 4 ++-- docs/guides/agents.md | 4 ++-- src/wardline/mcp/server.py | 4 ++-- tests/unit/mcp/test_server_query_explain.py | 12 +++++++++++- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c4714d9..61129668 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,8 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 smallest "did the gate pass?" payload), `include_suppressed: false` (drop suppressed bodies; counts stay in `summary`), and `max_findings: N` (cap the returned bodies). `explain: true` no longer inlines provenance for *every* active defect — the one-shot - blowup that returned 56,820 chars on one line — it is capped at 25 by default - (tighten with `max_findings`). 
Every cut is reported in a new `truncation` block + blowup that returned 56,820 chars on one line — it is capped at 10 by default + (raise/lower with `max_findings`). Every cut is reported in a new `truncation` block (`findings_total` / `findings_returned` / `findings_truncated` / `explanations_truncated`) so a bounded payload never reads as "covered everything." `summary`/`gate` always describe the whole project; the CLI `--format agent-summary` diff --git a/docs/guides/agents.md b/docs/guides/agents.md index 64d4d5c9..d0ab3d36 100644 --- a/docs/guides/agents.md +++ b/docs/guides/agents.md @@ -235,8 +235,8 @@ project — these only bound the returned finding bodies): - `include_suppressed: false` — drop suppressed (baselined/waived/judged) bodies; the suppression counts stay in `summary`. - `max_findings: N` — cap the returned bodies (and inlined explanations). -- `explain: true` — inline each active defect's provenance; capped at 25 by - default (tighten with `max_findings`). +- `explain: true` — inline each active defect's provenance; capped at 10 by + default (raise/lower with `max_findings`). Every cut is reported in the response `truncation` block (`findings_total`, `findings_returned`, `findings_truncated`, `explanations_truncated`) so a bounded diff --git a/src/wardline/mcp/server.py b/src/wardline/mcp/server.py index f0188a2b..76dc0d9e 100644 --- a/src/wardline/mcp/server.py +++ b/src/wardline/mcp/server.py @@ -696,7 +696,7 @@ def _fix(args: dict[str, Any], root: Path) -> dict[str, Any]: # Default ceiling on the number of active-defect provenances inlined by `explain: true` # on the MCP `scan`. Bounds the one-shot payload (the dogfood report hit 56,820 chars on # one line over a whole repo); an explicit `max_findings` tightens it further. 
-_EXPLAIN_DEFAULT_CAP = 25 +_EXPLAIN_DEFAULT_CAP = 10 class WardlineMCPServer: @@ -838,7 +838,7 @@ def _register_tools(self) -> None: "description": "Inline each active defect's taint provenance " "(immediate tainted callee, source boundary, trust tiers, resolution " "counts) — one call instead of an explain_taint per finding. Inlining is " - "capped at 25 provenances by default (tighten with max_findings); the cut " + "capped at 10 provenances by default (raise/lower with max_findings); the cut " "is reported at truncation.explanations_truncated.", }, "summary_only": { diff --git a/tests/unit/mcp/test_server_query_explain.py b/tests/unit/mcp/test_server_query_explain.py index fb39f1e7..100fdd36 100644 --- a/tests/unit/mcp/test_server_query_explain.py +++ b/tests/unit/mcp/test_server_query_explain.py @@ -111,12 +111,22 @@ def test_explain_true_has_default_cap(tmp_path): (tmp_path / "svc.py").write_text(_many_leaks(40), encoding="utf-8") out = _scan({"explain": True}, tmp_path) explained = [f for f in out["findings"] if "explanation" in f] - assert 0 < len(explained) <= 25 # default cap + assert 0 < len(explained) <= 10 # default cap assert out["truncation"]["explanations_truncated"] is True # the true total is still reported, so nothing is silently hidden assert out["summary"]["active"] == 40 +def test_max_findings_can_raise_explain_cap_above_default(tmp_path): + # max_findings is the explicit knob: it can RAISE the inlined-explanation count above + # the conservative default (10) when the agent accepts the larger payload. + (tmp_path / "svc.py").write_text(_many_leaks(20), encoding="utf-8") + out = _scan({"explain": True, "max_findings": 20}, tmp_path) + explained = [f for f in out["findings"] if "explanation" in f] + assert len(explained) > 10 # exceeded the default cap + assert out["truncation"]["explanations_truncated"] is False + + def test_summary_only_omits_finding_arrays(tmp_path): # (d): the "did the gate pass?" payload — counts + gate, no finding bodies. 
(tmp_path / "svc.py").write_text(_many_leaks(5), encoding="utf-8") From fb557b8491076d5f24506b9e966d3597ca5c8a98 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 17:46:13 +1000 Subject: [PATCH 07/17] =?UTF-8?q?fix(gate):=20next=5Factions=20are=20gate-?= =?UTF-8?q?aware=20=E2=80=94=20never=20say=20"passed"=20on=20a=20tripped?= =?UTF-8?q?=20gate=20(wardline-be75c6676d=20follow-up)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dogfood re-test, #2 "Worse" half: when the gate trips solely on baselined findings summary.active is 0, so next_actions said "no active defects; rescan after edits" — telling the agent it PASSED while the gate FAILED. _next_actions_for now takes the GateDecision. With 0 active defects but a tripped gate it emits a scan action whose reason names the gate failure + the escape hatches (trust_suppressions / new_since / clear the baseline; see gate.reason / gate.migration_hint) instead of the passive "rescan after edits". The active>0 and genuinely-clean paths are unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/wardline/core/agent_summary.py | 36 +++++++++++++++++++-------- tests/unit/core/test_agent_summary.py | 21 ++++++++++++++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/wardline/core/agent_summary.py b/src/wardline/core/agent_summary.py index b90c2407..984b86dd 100644 --- a/src/wardline/core/agent_summary.py +++ b/src/wardline/core/agent_summary.py @@ -106,8 +106,10 @@ def to_dict(self) -> dict[str, Any]: "suppressed_findings": suppressed, "engine_facts": engine_facts, # next_actions follow the whole-project active count, not the displayed slice, - # so a summary_only/filtered view does not falsely say "no active defects". 
- "next_actions": _next_actions_for(count_active), + # so a summary_only/filtered view does not falsely say "no active defects" — and + # they are GATE-AWARE so a baselined-only trip (0 active + gate FAILED) never + # reads as "rescan after edits" / passed (dogfood #2, the "Worse" half). + "next_actions": _next_actions_for(count_active, self.gate), } @@ -177,14 +179,28 @@ def _finding_entry(finding: Finding, *, include_next: bool) -> dict[str, Any]: return entry -def _next_actions_for(active_count: int) -> list[dict[str, Any]]: - if active_count == 0: - return [{"tool": "scan", "reason": "no active defects; rescan after edits"}] - return [ - {"tool": "explain_taint", "reason": "inspect each active defect before editing"}, - {"tool": "file_finding", "reason": "promote confirmed true positives after Filigree emission"}, - {"tool": "scan", "reason": "rescan after fixes to verify closure"}, - ] +def _next_actions_for(active_count: int, gate: GateDecision) -> list[dict[str, Any]]: + if active_count > 0: + return [ + {"tool": "explain_taint", "reason": "inspect each active defect before editing"}, + {"tool": "file_finding", "reason": "promote confirmed true positives after Filigree emission"}, + {"tool": "scan", "reason": "rescan after fixes to verify closure"}, + ] + if gate.tripped: + # 0 active defects but the gate FAILED — it tripped on suppressed/baselined findings. + # Do NOT say "rescan after edits" (which reads as passed); point at the gate verdict. + detail = gate.reason or "the gate tripped on suppressed (baselined/waived/judged) findings" + return [ + { + "tool": "scan", + "reason": ( + f"gate FAILED with 0 active defects — {detail}. To clear: pass " + "trust_suppressions (trusted checkout) or new_since (PR), or remove the " + "baseline/waiver/judged entries; see gate.reason / gate.migration_hint." 
+ ), + } + ] + return [{"tool": "scan", "reason": "no active defects; rescan after edits"}] def build_agent_summary( diff --git a/tests/unit/core/test_agent_summary.py b/tests/unit/core/test_agent_summary.py index 9c6972a5..d6f7d531 100644 --- a/tests/unit/core/test_agent_summary.py +++ b/tests/unit/core/test_agent_summary.py @@ -73,6 +73,27 @@ def test_agent_summary_no_active_defects_still_has_next_actions(tmp_path: Path) assert out["next_actions"] == [{"tool": "scan", "reason": "no active defects; rescan after edits"}] +def test_agent_summary_next_actions_do_not_say_passed_when_gate_tripped(tmp_path: Path) -> None: + # Dogfood #2 (the "Worse" half): with the gate tripped solely on baselined findings, + # summary.active is 0 — but next_actions must NOT say "no active defects; rescan after + # edits" (which reads as PASSED). It must reflect the gate failure and the escape hatches. + (tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") + scan = run_scan(tmp_path) + fp = next(f.fingerprint for f in scan.findings if f.rule_id == "PY-WL-101") + bl = tmp_path / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + write_baseline(bl, [next(f for f in scan.findings if f.fingerprint == fp)]) + rescan = run_scan(tmp_path) + out = build_agent_summary(rescan, gate_decision(rescan, Severity.ERROR)).to_dict() + + assert out["gate"]["tripped"] is True + assert out["summary"]["active_defects"] == 0 + reasons = " ".join(a["reason"].lower() for a in out["next_actions"]) + assert "no active defects; rescan after edits" not in reasons # must not imply pass + assert "gate" in reasons + assert "trust_suppressions" in reasons or "new_since" in reasons + + def test_agent_summary_surfaces_suppressed_findings(tmp_path: Path) -> None: (tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") leak = next(f for f in run_scan(tmp_path).findings if f.rule_id == "PY-WL-101") From 35ff7c41d7a264fd52101311198b64813ddc6c35 Mon Sep 17 00:00:00 2001 From: John 
Morrissey Date: Sat, 6 Jun 2026 17:46:13 +1000 Subject: [PATCH 08/17] fix(filigree): distinguish 401 auth-rejected from transport-unreachable (wardline-53a44a3bb1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dogfood #5: a 401 (token absent from the CLI env) was reported as "could not reach Filigree" — a wrong diagnosis that sent the agent chasing a broken-bridge / wrong-endpoint theory. The prior seam work deliberately made 401/403 SOFT (auth failure must not crash the scan loop); that is kept — only the MESSAGE changes. EmitResult now carries `status` (the HTTP status when one reached us; None when the transport itself failed) and `auth_rejected` (the 401/403 case). The CLI prints "Filigree returned 401 (auth rejected) … set WARDLINE_FILIGREE_TOKEN" vs a 5xx "server error" vs the genuine "could not reach"; the MCP scan filigree_emit block and agent_summary carry the same discriminated disabled_reason. 401/403 stays reachable=False (non-load-bearing), never exit-2.
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/wardline/cli/scan.py | 34 ++++++++++++++++++--- src/wardline/core/filigree_emit.py | 26 +++++++++++----- src/wardline/mcp/server.py | 15 ++++++++- tests/unit/cli/test_cli.py | 26 ++++++++++++++++ tests/unit/core/test_filigree_emit.py | 23 ++++++++++++++ tests/unit/mcp/test_server_filigree_emit.py | 18 +++++++++++ 6 files changed, 129 insertions(+), 13 deletions(-) diff --git a/src/wardline/cli/scan.py b/src/wardline/cli/scan.py index 99e345e2..2cca874d 100644 --- a/src/wardline/cli/scan.py +++ b/src/wardline/cli/scan.py @@ -301,10 +301,24 @@ def confirm_cb(rel_path: str, orig: str, replacement: str, f: Finding) -> bool: raise SystemExit(2) from exc if emit_result is not None: if not emit_result.reachable: - click.echo( - f"warning: could not reach Filigree at {filigree_url}; findings written locally only.", - err=True, - ) + if emit_result.auth_rejected: + # Reachable but refused — actionable, NOT "could not reach" (dogfood #5). + click.echo( + f"warning: Filigree returned {emit_result.status} (auth rejected) at {filigree_url}; " + "set WARDLINE_FILIGREE_TOKEN (or .env) to the project token. 
Findings written locally only.", + err=True, + ) + elif emit_result.status is not None: + click.echo( + f"warning: Filigree returned {emit_result.status} (server error) at {filigree_url}; " + "findings written locally only.", + err=True, + ) + else: + click.echo( + f"warning: could not reach Filigree at {filigree_url}; findings written locally only.", + err=True, + ) else: line = ( f"emitted {len(findings)} finding(s) to {filigree_url} — " @@ -378,10 +392,20 @@ def _filigree_status(result: EmitResult | None) -> dict[str, object]: "updated": result.updated, "failed": result.failed, "warnings": list(result.warnings), - "disabled_reason": None if result.reachable else "filigree unreachable", + "disabled_reason": _filigree_unreachable_reason(result), } +def _filigree_unreachable_reason(result: EmitResult) -> str | None: + if result.reachable: + return None + if result.auth_rejected: + return f"filigree auth-rejected ({result.status}); set WARDLINE_FILIGREE_TOKEN" + if result.status is not None: + return f"filigree server error ({result.status})" + return "filigree unreachable" + + def _loomweave_status(result: object | None) -> dict[str, object]: if result is None: return { diff --git a/src/wardline/core/filigree_emit.py b/src/wardline/core/filigree_emit.py index 2b548b39..caff60d8 100644 --- a/src/wardline/core/filigree_emit.py +++ b/src/wardline/core/filigree_emit.py @@ -97,6 +97,14 @@ class EmitResult: updated: int = 0 failed: int = 0 warnings: tuple[str, ...] = () + # Discriminate WHY enrichment was unavailable so the caller can say the actionable + # thing instead of a flat "could not reach" (dogfood #5). ``status`` is the HTTP + # status when one reached us (401/403 auth-refused, 5xx outage) and None when the + # transport itself failed (connection refused / DNS / timeout — genuinely unreachable). + # ``auth_rejected`` is the 401/403 case: present-but-refusing-bearer-auth. All of these + # stay SOFT (reachable=False); only the message differs. 
+ status: int | None = None + auth_rejected: bool = False class Transport(Protocol): @@ -142,14 +150,18 @@ def emit(self, findings: Sequence[Finding], *, scanned_paths: Sequence[str] = () resp = self._transport.post(self._url, body, headers) except (urllib.error.URLError, OSError): # Connection refused / DNS / timeout — sibling absent. Enrichment is - # non-load-bearing: warn (at the CLI) and continue. - return EmitResult(reachable=False) - if resp.status >= 500 or resp.status in (401, 403): - # Server-side outage (5xx) or auth refusal (401/403, Filigree present but its - # opt-in bearer auth is on and rejecting us) — the sibling is degraded/refusing, - # not a Wardline payload bug. Treat like absent (warn + continue) so a Filigree - # 503 or 401 never makes the gate load-bearing. + # non-load-bearing: warn (at the CLI) and continue. No status reached us, so + # this is the genuine "could not reach" case (status=None). return EmitResult(reachable=False) + if resp.status in (401, 403): + # Filigree is present but its opt-in bearer auth is on and refusing us. Stays + # SOFT (enrichment unavailable, never exit-2) — but distinguished as auth so the + # caller can say "401 (set WARDLINE_FILIGREE_TOKEN)" instead of "could not reach". + return EmitResult(reachable=False, status=resp.status, auth_rejected=True) + if resp.status >= 500: + # Server-side outage (5xx) — the sibling is degraded, not a Wardline payload bug. + # Treat like absent (warn + continue), carrying the status for an honest message. + return EmitResult(reachable=False, status=resp.status) if not 200 <= resp.status < 300: # 3xx (a redirect reached the client) or any remaining 4xx (notably 400): Wardline # sent a request the server would not accept — bad payload / wrong endpoint. Loud. 
diff --git a/src/wardline/mcp/server.py b/src/wardline/mcp/server.py index 76dc0d9e..14434232 100644 --- a/src/wardline/mcp/server.py +++ b/src/wardline/mcp/server.py @@ -59,6 +59,10 @@ def _emit_filigree( "updated": er.updated, "failed": er.failed, "warnings": list(er.warnings), + # Distinguish auth-rejected (401/403) from transport-unreachable so the agent reads + # an actionable reason, not a flat "unreachable" (dogfood #5). + "status": er.status, + "auth_rejected": er.auth_rejected, } @@ -73,7 +77,16 @@ def _filigree_emit_status(block: dict[str, Any] | None) -> dict[str, Any]: "warnings": [], "disabled_reason": "not configured", } - return {"configured": True, **block} + reachable = block.get("reachable") + if reachable: + disabled_reason = None + elif block.get("auth_rejected"): + disabled_reason = f"filigree auth-rejected ({block.get('status')}); set WARDLINE_FILIGREE_TOKEN" + elif block.get("status") is not None: + disabled_reason = f"filigree server error ({block.get('status')})" + else: + disabled_reason = "filigree unreachable" + return {"configured": True, "disabled_reason": disabled_reason, **block} def _loomweave_write_status(block: dict[str, Any] | None) -> dict[str, Any]: diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index 04ceb35f..9e76b7a1 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -773,6 +773,32 @@ def emit(self, findings, *, scanned_paths=()): assert "could not reach" in result.output.lower() +def test_scan_filigree_401_says_auth_not_unreachable(tmp_path, monkeypatch) -> None: + # Dogfood #5: a 401 (token absent) is reachable-but-refused, NOT transport-unreachable. + # The message must name the auth cause + the env var, never "could not reach". 
+ proj = tmp_path / "proj" + proj.mkdir() + _write(proj, "svc.py", _LEAKY) + + class _AuthRejectedEmitter: + def __init__(self, url, **kw): + pass + + def emit(self, findings, *, scanned_paths=()): + from wardline.core.filigree_emit import EmitResult + + return EmitResult(reachable=False, status=401, auth_rejected=True) + + monkeypatch.setattr("wardline.cli.scan.FiligreeEmitter", _AuthRejectedEmitter) + out = tmp_path / "f.jsonl" + result = CliRunner().invoke(scan, [str(proj), "--output", str(out), "--filigree-url", "http://x"]) + assert result.exit_code == 0, result.output + low = result.output.lower() + assert "401" in result.output + assert "could not reach" not in low # the precise distinction the report asked for + assert "wardline_filigree_token" in low + + # --- SP9: wardline scan --loomweave-url --------------------------------------- # scan.py imports write_facts_to_loomweave lazily inside the `if loomweave_url` block # (`from wardline.loomweave.write import write_facts_to_loomweave`), so the binding diff --git a/tests/unit/core/test_filigree_emit.py b/tests/unit/core/test_filigree_emit.py index 44b21e1a..c0b573c8 100644 --- a/tests/unit/core/test_filigree_emit.py +++ b/tests/unit/core/test_filigree_emit.py @@ -154,6 +154,29 @@ def test_http_auth_refused_is_soft_not_loud(status: int) -> None: t = _FakeTransport(response=Response(status=status, body='{"error":"unauthorized"}')) res = FiligreeEmitter("http://x", transport=t).emit([_f()]) assert res.reachable is False + # ...but the RESULT must distinguish auth-rejected from transport-unreachable so the + # caller can say "401 (set WARDLINE_FILIGREE_TOKEN)" instead of "could not reach" + # (dogfood #5). 401/403 stays SOFT — only the message changes. 
+ assert res.status == status + assert res.auth_rejected is True + + +def test_transport_unreachable_has_no_status_and_is_not_auth_rejected() -> None: + import urllib.error + + t = _FakeTransport(exc=urllib.error.URLError("connection refused")) + res = FiligreeEmitter("http://x", transport=t).emit([_f()]) + assert res.reachable is False + assert res.status is None # genuinely could-not-reach + assert res.auth_rejected is False + + +def test_http_5xx_carries_status_but_is_not_auth_rejected() -> None: + t = _FakeTransport(response=Response(status=503, body="upstream down")) + res = FiligreeEmitter("http://x", transport=t).emit([_f()]) + assert res.reachable is False + assert res.status == 503 + assert res.auth_rejected is False def test_bearer_token_carried_when_provided() -> None: diff --git a/tests/unit/mcp/test_server_filigree_emit.py b/tests/unit/mcp/test_server_filigree_emit.py index 2e47b25d..7e1fc0e8 100644 --- a/tests/unit/mcp/test_server_filigree_emit.py +++ b/tests/unit/mcp/test_server_filigree_emit.py @@ -61,6 +61,9 @@ def test_scan_emits_to_filigree_when_emitter_present(tmp_path): "updated": 1, "failed": 0, "warnings": [], + "status": None, + "auth_rejected": False, + "disabled_reason": None, } assert emitter.scanned_paths == ("svc.py",) @@ -78,6 +81,9 @@ def test_scan_reports_both_integrations_successful(tmp_path): "updated": 1, "failed": 0, "warnings": [], + "status": None, + "auth_rejected": False, + "disabled_reason": None, } @@ -122,4 +128,16 @@ def test_scan_unreachable_filigree_is_soft(tmp_path): assert out["filigree"]["reachable"] is False assert out["filigree_emit"]["configured"] is True assert out["filigree_emit"]["reachable"] is False + assert out["filigree_emit"]["disabled_reason"] == "filigree unreachable" assert out["summary"]["total"] >= 1 + + +def test_scan_filigree_401_surfaces_auth_reason_to_agent(tmp_path): + # Dogfood #5 (MCP parity): a 401 stays soft but the agent must read an actionable + # disabled_reason naming the token, not a 
flat "unreachable". + (tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") + out = _scan({}, tmp_path, None, FakeEmitter(EmitResult(reachable=False, status=401, auth_rejected=True))) + assert out["filigree"]["reachable"] is False # still soft + reason = out["filigree_emit"]["disabled_reason"] + assert "401" in reason and "WARDLINE_FILIGREE_TOKEN" in reason + assert "unreachable" not in reason From 39b87efd38e0c5e0fba29452291faf79fcbd446e Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 17:48:23 +1000 Subject: [PATCH 09/17] docs(changelog): record next_actions gate-awareness (#2) and 401-vs-unreachable (#5) Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61129668..4b02f371 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 code change" case made self-explaining; the secure default itself is unchanged. ### Fixed +- **`next_actions` is gate-aware — never reads as "passed" when the gate failed + (dogfood re-test, #2).** When the gate trips solely on baselined findings, + `summary.active` is 0, so the agent-summary's `next_actions` used to say + *"no active defects; rescan after edits"* — telling the agent it passed while the + gate FAILED. It now emits a scan action naming the gate failure and the escape + hatches (trust_suppressions / new_since / clear the baseline; see `gate.reason` / + `gate.migration_hint`). The active-defects and genuinely-clean paths are unchanged. +- **CLI/MCP distinguish a Filigree `401` (auth-rejected) from transport-unreachable + (dogfood friction #5).** A `401` (token absent) was reported as *"could not reach + Filigree"*, sending agents to chase a broken-bridge theory. 
`EmitResult` now carries + `status` + `auth_rejected`; the CLI prints *"Filigree returned 401 (auth rejected) … + set WARDLINE_FILIGREE_TOKEN"* (and a distinct `5xx` "server error" vs the genuine + "could not reach"), and the MCP `scan` `filigree_emit` block / agent-summary carry the + same discriminated `disabled_reason`. `401`/`403` stays **soft** (non-load-bearing, + never exit-2) — only the message changed. - **`scan --format legis --allow-dirty` emits an unsigned dev artifact instead of refusing (dogfood friction #1).** On a dirty working tree `scan --format legis` failed `exit 2` naming an `allow_dirty` flag that was never exposed — presenting From fa23a10aa8110a55cd439a56678ee2883f888a54 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 17:50:16 +1000 Subject: [PATCH 10/17] chore(uv): sync uv.lock to the loomweave extra (was stale 'clarion' since the rebrand) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uv.lock still carried the pre-rebrand `clarion` optional-dependency extra; pyproject already renamed it to `loomweave` (Clarion→Loomweave). Regenerated to match — no dependency change (blake3 >=1.0, unchanged), just the extra name. Co-Authored-By: Claude Opus 4.8 (1M context) --- uv.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/uv.lock b/uv.lock index c67b8fdf..474aea73 100644 --- a/uv.lock +++ b/uv.lock @@ -1081,13 +1081,13 @@ name = "wardline" source = { editable = "." 
} [package.optional-dependencies] -clarion = [ - { name = "blake3" }, -] docs = [ { name = "mkdocs" }, { name = "mkdocs-material" }, ] +loomweave = [ + { name = "blake3" }, +] scanner = [ { name = "click" }, { name = "jsonschema" }, @@ -1108,14 +1108,14 @@ dev = [ [package.metadata] requires-dist = [ - { name = "blake3", marker = "extra == 'clarion'", specifier = ">=1.0" }, + { name = "blake3", marker = "extra == 'loomweave'", specifier = ">=1.0" }, { name = "click", marker = "extra == 'scanner'", specifier = ">=8.0" }, { name = "jsonschema", marker = "extra == 'scanner'", specifier = ">=4.0" }, { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.6" }, { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5" }, { name = "pyyaml", marker = "extra == 'scanner'", specifier = ">=6.0" }, ] -provides-extras = ["clarion", "docs", "scanner"] +provides-extras = ["docs", "loomweave", "scanner"] [package.metadata.requires-dev] dev = [ From a8103dff59a645a5c4f5774003597c2ea990daa7 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 19:14:40 +1000 Subject: [PATCH 11/17] =?UTF-8?q?fix(ready-queue):=20resolve=203=20tickets?= =?UTF-8?q?=20=E2=80=94=20PY-WL-110=20weft=5Fmarkers=20gap,=20lambda=20bra?= =?UTF-8?q?nch-locality,=20finding-lifecycle=20glossary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves three Filigree ready-queue items, built TDD with adversarial review. PY-WL-110 weft_markers soundness gap (wardline-d62845bb18, P2) contradictory_trust.py hardcoded `wardline.decorators.*` as the only marker prefix, silently missing contradictory stacks imported from the renamed `weft_markers` shim. Now derives _MARKER_NAMES + _MARKER_MODULE_PREFIXES from BUILTIN_BOUNDARY_TYPES so the rule can't drift from the grammar. +2 tests. 
Lambda bindings are branch-local (wardline-36016d26f3, P3) _CURRENT_LAMBDA_BINDINGS was shared across if/else, try/except, match arms, leaking a lambda bound in one arm into siblings (over-fire). Each arm now walks an arm-local copy. NOTE: the first cut of the merge-out (clear()+full-union with the synthetic fall-through arm last) introduced a *false-negative regression* — verified empirically against HEAD: a lambda rebound in a no-else `if` / no-catch-all `match` and called after the branch resolved EXTERNAL_RAW on HEAD but INTEGRAL after the naive fix. Replaced with a delta merge (layer each arm's net add/changed bindings onto the pre-branch state in source order) that keeps the leak fix AND reproduces HEAD's after-branch bindings, so no new false negative. +3 over-fire guards, +3 no-false-negative guards. Finding-lifecycle vocabulary glossary (wardline-26e84dbd44, P3) Audited wardline's own usage: `active` is already the canonical word on every surface except the CLI summary, which printed `N new`. Relabelled to `N active` (text only; no JSON/SARIF/wire field renamed). Added the canonical glossary docs/reference/finding-lifecycle-vocabulary.md (single source of truth for new/active/suppressed/baselined/waived/judged + emitted-active vs gate population) with discipline tests + nav wiring. Cross-tool asks (Filigree first-seen "new", legis active) recorded as coordination context, not renamed. Full suite 2471 passed, ruff + mypy clean, mkdocs --strict OK. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 27 +++ docs/getting-started.md | 2 +- docs/guides/agents.md | 6 +- docs/guides/suppression.md | 9 +- docs/index.md | 2 +- .../reference/finding-lifecycle-vocabulary.md | 168 ++++++++++++++++++ mkdocs.yml | 1 + src/wardline/cli/scan.py | 7 +- .../scanner/rules/contradictory_trust.py | 16 +- src/wardline/scanner/taint/variable_level.py | 129 ++++++++++++-- tests/cli/__init__.py | 0 tests/cli/test_scan_summary_vocab.py | 75 ++++++++ tests/docs/__init__.py | 0 tests/docs/test_glossary_vocabulary.py | 29 +++ tests/unit/cli/test_cli.py | 4 +- .../scanner/rules/test_contradictory_trust.py | 34 ++++ tests/unit/scanner/rules/test_sink_rules.py | 65 +++++++ .../unit/scanner/taint/test_variable_level.py | 120 +++++++++++++ 18 files changed, 661 insertions(+), 33 deletions(-) create mode 100644 docs/reference/finding-lifecycle-vocabulary.md create mode 100644 tests/cli/__init__.py create mode 100644 tests/cli/test_scan_summary_vocab.py create mode 100644 tests/docs/__init__.py create mode 100644 tests/docs/test_glossary_vocabulary.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b02f371..300edf4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,6 +68,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 marked `dirty: true` (legis records it `unverified`). Signing stays clean-tree-only; the loud refusal without `--allow-dirty` is unchanged. Lets the dev/tour loop exercise the Wardline→legis handshake without a commit. +- **PY-WL-110 (contradictory-trust) now fires for the `weft_markers` namespace + (soundness; `wardline-d62845bb18`).** The rule hardcoded + `wardline.decorators.*` as the only recognised marker prefix, so a contradictory + `@trusted` + `@external_boundary` stack imported from the renamed `weft_markers` + shim (the namespace authors are steered toward post-rebrand) was silently *not* + flagged. 
The prefix set is now derived from `BUILTIN_BOUNDARY_TYPES` + (`{wardline.decorators, weft_markers}`) so the rule cannot drift from the grammar + that seeds provenance. The other boundary rules read resolved provenance and never + had this gap. +- **Taint: lambda bindings are now branch-local (`wardline-36016d26f3`).** The + `_CURRENT_LAMBDA_BINDINGS` map was shared across `if`/`else`, `try`/`except`, and + `match` arms (unlike `var_taints`), so a lambda bound in one arm leaked into a + mutually-exclusive sibling and could over-fire (false positive) in adversarial + branch layouts. Each arm is now walked against an arm-local copy and re-converged by + layering each arm's *delta* onto the pre-branch state in source order — which both + removes the cross-arm leak and preserves a rebinding made in a no-`else` / no-catch-all + arm for a call after the branch (so no new false negative is introduced). - **Loomweave HMAC signer resync (auth path was 401ing every signed request).** Wardline's request signature drifted from Loomweave's verifier (ADR-042): the canonical message is now `METHOD\nPATH\nSHA256HEX(body)\nTIMESTAMP\nNONCE` (the @@ -88,6 +105,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 honest (both populations are the same length). ### Changed +- **CLI scan summary now labels the non-suppressed count `active`, not `new`** + (`wardline-26e84dbd44`). The human summary line previously printed + `… N new`, but every other surface — the `SuppressionState.ACTIVE` enum, the + `ScanSummary.active` field, the MCP `summary.active` key, the agent-summary + `active_defects` key, and the `wardline:loop` prompt — already said `active`. + The CLI now matches, so an agent never reconciles a CLI "N new" against an MCP + "active". Text-only (the count value is unchanged); no JSON/SARIF/wire field + renamed. 
The new [Finding lifecycle & gate vocabulary](https://github.com/foundryside-dev/wardline/blob/main/docs/reference/finding-lifecycle-vocabulary.md) + reference page is the single source of truth for these state words (and the + three distinct meanings of "new" across the suite). - **Filigree clients no longer crash the scan loop when Filigree auth is enabled.** `401`/`403` from `/api/weft/*` are now treated as **soft** (enrichment unavailable, like a 5xx/outage) across the emit and promote/file clients — previously a loud diff --git a/docs/getting-started.md b/docs/getting-started.md index 137ad2e7..34ea47b9 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -32,7 +32,7 @@ wardline scan . --format jsonl ``` ```text -scanned 2 file(s); 4 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 new -> findings.jsonl +scanned 2 file(s); 4 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 active -> findings.jsonl ``` !!! note "Where the findings go" diff --git a/docs/guides/agents.md b/docs/guides/agents.md index d0ab3d36..b100e6ea 100644 --- a/docs/guides/agents.md +++ b/docs/guides/agents.md @@ -105,7 +105,7 @@ By default a scan reports but never fails — the gate is opt-in: ```console $ wardline scan . -scanned 1 file(s); 3 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 new -> findings.jsonl +scanned 1 file(s); 3 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 active -> findings.jsonl ``` ```console @@ -118,7 +118,7 @@ at or above the threshold drives a non-zero exit: ```console $ wardline scan . --fail-on ERROR -scanned 1 file(s); 3 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 new -> findings.jsonl +scanned 1 file(s); 3 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 active -> findings.jsonl ``` ```console @@ -190,7 +190,7 @@ a sibling Weft tool — emit SARIF 2.1.0: ```console $ wardline scan . 
--format sarif --output results.sarif --fail-on ERROR -scanned 1 file(s); 3 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 new -> results.sarif +scanned 1 file(s); 3 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 active -> results.sarif ``` The log is standard SARIF 2.1.0 with a `wardline` driver and one result per diff --git a/docs/guides/suppression.md b/docs/guides/suppression.md index 7c3946a9..b0c5e8fb 100644 --- a/docs/guides/suppression.md +++ b/docs/guides/suppression.md @@ -18,9 +18,14 @@ breakdown: ```console $ wardline scan . -scanned 2 file(s); 4 finding(s) — 1 suppressed (1 baseline / 0 waiver / 0 judged), 0 new -> findings.jsonl +scanned 2 file(s); 4 finding(s) — 1 suppressed (1 baseline / 0 waiver / 0 judged), 0 active -> findings.jsonl ``` +The trailing count is the number of **active** (non-suppressed) defects. For the +precise meaning of every state word — `active`, `baselined`, `waived`, `judged`, +and the three distinct meanings of "new" — see +[Finding lifecycle & gate vocabulary](../reference/finding-lifecycle-vocabulary.md). + ## Suppressions and the `--fail-on` gate (read this first) All three layers — baseline, waiver, judged — live in **committed repository @@ -129,7 +134,7 @@ non-empty string; a duplicate fingerprint or a non-ISO `expires` is a hard error ```console $ wardline scan . -scanned 2 file(s); 4 finding(s) — 1 suppressed (0 baseline / 1 waiver / 0 judged), 0 new -> findings.jsonl +scanned 2 file(s); 4 finding(s) — 1 suppressed (0 baseline / 1 waiver / 0 judged), 0 active -> findings.jsonl ``` Expiry is **inclusive**: a waiver is active through its `expires` day and lapses diff --git a/docs/index.md b/docs/index.md index 9795b7ac..abf67db1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -33,7 +33,7 @@ wardline scan . 
--format jsonl ``` ```text -scanned 2 file(s); 4 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 new -> findings.jsonl +scanned 2 file(s); 4 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 active -> findings.jsonl ``` In JSONL mode the findings are written to `findings.jsonl` in the current diff --git a/docs/reference/finding-lifecycle-vocabulary.md b/docs/reference/finding-lifecycle-vocabulary.md new file mode 100644 index 00000000..dc8922ca --- /dev/null +++ b/docs/reference/finding-lifecycle-vocabulary.md @@ -0,0 +1,168 @@ +# Finding lifecycle & gate vocabulary + +This is the single source of truth for the words Wardline uses to describe the +**state and lifecycle of a finding** — `new`, `active`, `suppressed`, +`baselined`, `waived`, `judged` — and how each one maps onto the three surfaces +an agent reads: the **CLI summary line**, the **MCP / agent-summary JSON**, and +the **Filigree store**. + +It is deliberately distinct from the [Trust vocabulary](vocabulary.md), which +documents the *trust-tier* markers (`trusted`, `trust_boundary`, +`external_boundary`) the engine reasons about. That page is about what data is +trusted; this page is about what happens to a finding once it is produced. + +Every claim below cites a real `file:line` so the vocabulary stays anchored to +the code. The discipline test `tests/docs/test_glossary_vocabulary.py` fails if a +`SuppressionState` value is added without being documented here. 
+ +## The categories of a finding + +Before lifecycle state, two orthogonal axes classify every finding: + +| Axis | Values | Defined at | +| --- | --- | --- | +| `kind` | `defect`, `fact`, `classification`, `metric`, `suggestion` | `src/wardline/core/finding.py:59-65` (`Kind`) | +| `severity` | `CRITICAL`, `ERROR`, `WARN`, `INFO`, `NONE` | `src/wardline/core/finding.py:51-56` (`Severity`) | + +Only `Kind.DEFECT` findings are ever suppressed or gated; facts and metrics +(`Severity.NONE`) never participate in the `--fail-on` gate +(`src/wardline/core/suppression.py:20-22`, `src/wardline/core/suppression.py:37-39`). + +## The four suppression states + +`SuppressionState` (`src/wardline/core/finding.py:67-71`) has exactly four +values. Every emitted `DEFECT` carries exactly one: + +| State | Meaning | Set by | +| --- | --- | --- | +| `active` | Not suppressed — the default. A live defect. | default (`src/wardline/core/finding.py:68`, `src/wardline/core/finding.py:103`) | +| `baselined` | Matched a fingerprint in `.wardline/baseline.yaml`. | `src/wardline/core/suppression.py:70` | +| `waived` | Matched an unexpired waiver in `wardline.yaml`. | `src/wardline/core/suppression.py:65-66` | +| `judged` | The LLM triage judge ruled it a false positive (`.wardline/judged.yaml`). | `src/wardline/core/suppression.py:67-68` | + +When more than one layer matches a finding, **precedence is +waiver > judged > baseline** — explicit human intent wins, then the LLM verdict +(so its rationale is the visible reason), then the silent baseline +(`src/wardline/core/suppression.py:61-70`). + +**"suppressed"** is the umbrella term for "any state other than `active`": +`baselined` + `waived` + `judged`. The CLI prints this sum as the `suppressed` +count (`src/wardline/cli/scan.py:355`), and `to_filigree_metadata` only writes a +`suppressed` key when the state is not `active` +(`src/wardline/core/finding.py:184-187`). 
+ +## `active` is the one word for "non-suppressed defect" + +The canonical term for a live, non-suppressed defect is **`active`** — +consistently, on every surface: + +| Surface | Where | Term | +| --- | --- | --- | +| Enum | `src/wardline/core/finding.py:68` | `SuppressionState.ACTIVE = "active"` | +| Summary field | `src/wardline/core/run.py:49`, built at `src/wardline/core/run.py:264` | `ScanSummary.active` | +| CLI summary line | `src/wardline/cli/scan.py:356` | `… {s.active} active` | +| MCP scan response | `src/wardline/mcp/server.py:307` | `summary.active` | +| Agent-summary JSON | `src/wardline/core/agent_summary.py:86` | `summary.active_defects` | +| `wardline:loop` prompt | `src/wardline/mcp/prompts.py:13` | "Read `summary.active`" | + +The agent-summary key is `active_defects` rather than bare `active` — that is a +descriptive-suffix convention alongside `total_findings` / `suppressed_findings` +(`src/wardline/core/agent_summary.py:85-92`), not a different concept. It counts +the same population. + +The discipline test `tests/cli/test_scan_summary_vocab.py` pins this: the CLI +line says `active` (never `new`), and the count matches the agent-summary and MCP +surfaces. + +## The three meanings of "new" + +"new" is overloaded across the suite. Wardline's own surfaces no longer use it +for the active count (that was a historical CLI mislabel, now `active`). The word +still legitimately means three different things depending on the surface: + +| "new" on this surface | Means | Owner / anchor | +| --- | --- | --- | +| Filigree store | An **unseen fingerprint** — first time this finding identity is seen for a `(file, scan_source)`. Driven by `mark_unseen` / the absent-fingerprint sweep. | **Filigree-owned** lifecycle (`src/wardline/core/filigree_emit.py:68-76`) | +| `wardline scan --new-since <ref>` | **Delta-scope**: the gate fires only on defects in files/entities changed since a git ref; everything else is re-marked `baselined`. 
| `src/wardline/core/run.py:240-259`; help text `src/wardline/cli/scan.py` (`--new-since`, "new findings only") | +| (historical) CLI summary | Formerly relabelled the `active` count as "N new". **Corrected to "N active"** so the CLI matches every other surface. | `src/wardline/cli/scan.py:356` | + +The first-seen Filigree sense and the delta-scope `--new-since` sense are +genuinely distinct concepts; neither is "active". An agent should read the CLI / +MCP `active` count as "live defects now", Filigree's first-seen status as "is this +identity new to the tracker", and `--new-since` as "only gate on what changed". + +## Emitted-active vs the gate population + +There are **two distinct populations** of defects in one scan, and they can +differ on purpose: + +1. **Emitted-active** — `summary.active` counts `active` defects in the + **emitted** (post-annotation) findings (`src/wardline/core/run.py:262-265`). + Baseline / waiver / judged annotate these findings in place; a suppressed + defect is still emitted, just not counted as `active`. + +2. **Gate population** — the `--fail-on` gate evaluates a **separate** + `ScanResult.gate_findings` list: the *unsuppressed* population + (`src/wardline/core/run.py:226-230`). By default, repository-controlled + baseline / waiver / judged entries **annotate** the emitted findings but do + **not** clear the gate — so a malicious PR cannot green the gate by committing + a suppression keyed to its own new defect. `gate_decision` evaluates + `gate_findings` when present, else falls back to `findings` (the trusted + `--trust-suppressions` / directly-constructed path) + (`src/wardline/core/run.py:290-292`). + +This is why **`summary.active: 0` can co-exist with `gate.tripped: true`**: every +defect was suppressed by a committed baseline (so emitted-active is 0), but those +suppressions do not clear the unsuppressed gate population. It is by design, not a +bug. 
The gate result is reported separately from `summary.active`: `GateDecision` +carries `tripped` / `fail_on` / `exit_class` **plus** a human `reason` and the +`evaluated` population it judged (`src/wardline/core/run.py:82-92`), so the +`0 active + tripped` case explains itself instead of reading as a defect. The MCP +`scan` block exposes `gate.tripped` / `gate.reason` / `gate.evaluated` / +`gate.migration_hint` (`src/wardline/mcp/server.py:327-332`); the CLI prints +`gate: FAILED (--fail-on …) — <reason>` then `gate: evaluated <…>` on stderr +(`src/wardline/cli/scan.py:370`). + +`--new-since` scopes **both** populations identically: any `active` defect +outside the delta is re-marked `baselined` in both the emitted and gate lists +(`src/wardline/core/run.py:240-259`). + +## Cross-surface mapping table + +How each concept appears on each surface: + +| Concept | CLI summary text | `ScanSummary` field | MCP `summary` key | Agent-summary key | Filigree store | +| --- | --- | --- | --- | --- | --- | +| every finding | `N finding(s)` | `total` (`run.py:48`) | `total` (`server.py:306`) | `total_findings` (`agent_summary.py:85`) | one finding per wire entry | +| live defect | `N active` (`scan.py:356`) | `active` (`run.py:49,264`) | `active` (`server.py:307`) | `active_defects` (`agent_summary.py:86`) | no `suppressed` key (`finding.py:184`) | +| suppressed (sum) | `N suppressed` (`scan.py:355`) | `baselined+waived+judged` | the three keys | `suppressed_findings` (`agent_summary.py:87`) | `metadata.wardline.suppressed` (`finding.py:184-187`) | +| baselined | `N baseline` | `baselined` (`run.py:51`) | `baselined` (`server.py:308`) | `baselined` (`agent_summary.py:89`) | `suppressed: "baselined"` | +| waived | `N waiver` | `waived` (`run.py:52`) | `waived` (`server.py:309`) | `waived` (`agent_summary.py:90`) | `suppressed: "waived"` | +| judged | `N judged` | `judged` (`run.py:53`) | `judged` (`server.py:310`) | `judged` (`agent_summary.py:91`) | `suppressed: "judged"` | +| 
under-scan | `N file(s) could not be analyzed` | `unanalyzed` (`run.py:59`) | `unanalyzed` (`server.py:314`) | `unanalyzed` (`agent_summary.py:92`) | `WLN-ENGINE-*` facts | +| gate verdict | exit code + `--fail-on` | (`gate_findings`, `run.py:78`) | `gate.tripped` (`server.py:327`) | `gate.tripped` (`agent_summary.py:95`) | not emitted to Filigree | + +## For the suite + +This page is the **Wardline-anchored** glossary. Two pieces of the vocabulary are +owned by sibling tools and are intentionally **not** renamed by Wardline — they +are recorded here as coordination context, not as a change Wardline executes: + +- **Filigree's "new" / `seen_count` lifecycle is Filigree-owned.** Filigree + decides first-seen vs returning purely from fingerprint presence across scans + (`mark_unseen`, `src/wardline/core/filigree_emit.py:68-76`). Wardline emits the + fingerprint and `scanned_paths`; it does not, and should not, rename Filigree's + first-seen concept to match its own `active`. The two words mean different + things and that distinction is correct. + +- **legis receives the gate population as `active`.** The legis scan artifact + projects the *whole scan*, mapping `baselined` / `judged` onto legis's own + `suppressed` while `active` stays `active`, so legis reproduces Wardline's gate + population exactly (the "one judge" property). This is a contract Wardline + conforms to, not a rename of any other tool's fields (see the CHANGELOG legis + handoff entry and [Signed scan handoff to legis](../guides/legis-handoff.md)). + +In short: **within Wardline, `active` is the single word for a non-suppressed +defect, on every surface.** The remaining divergence is genuine cross-tool +semantics (Filigree's first-seen lifecycle, `--new-since` delta-scope) that this +glossary documents rather than collapses. No cross-repo rename is implied. 
diff --git a/mkdocs.yml b/mkdocs.yml index ff3c8564..c56e8896 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -76,6 +76,7 @@ nav: - Reference: - CLI: reference/cli.md - Trust vocabulary: reference/vocabulary.md + - Finding lifecycle & gate vocabulary: reference/finding-lifecycle-vocabulary.md - About: - Changelog: https://github.com/foundryside-dev/wardline/blob/main/CHANGELOG.md - Contributing: https://github.com/foundryside-dev/wardline/blob/main/CONTRIBUTING.md diff --git a/src/wardline/cli/scan.py b/src/wardline/cli/scan.py index 2cca874d..d51e01eb 100644 --- a/src/wardline/cli/scan.py +++ b/src/wardline/cli/scan.py @@ -345,10 +345,15 @@ def confirm_cb(rel_path: str, orig: str, replacement: str, f: Finding) -> bool: click.echo(line) s = result.summary unanalyzed_segment = f"; {s.unanalyzed} file(s) could not be analyzed" if s.unanalyzed else "" + # "active" = non-suppressed DEFECTs in the EMITTED findings — the canonical term + # used by SuppressionState.ACTIVE, ScanSummary.active, the MCP summary key, the + # agent-summary active_defects, and the wardline:loop prompt. It is NOT Filigree's + # first-seen "new" (unseen fingerprint) nor the --fail-on gate population + # (ScanResult.gate_findings). See docs/reference/finding-lifecycle-vocabulary.md. click.echo( f"scanned {result.files_scanned} file(s); {s.total} finding(s) — " f"{s.baselined + s.waived + s.judged} suppressed " - f"({s.baselined} baseline / {s.waived} waiver / {s.judged} judged), {s.active} new" + f"({s.baselined} baseline / {s.waived} waiver / {s.judged} judged), {s.active} active" f"{unanalyzed_segment} -> {output}" ) # A discovered-but-not-analysed file is a silent under-scan; never hide it. 
diff --git a/src/wardline/scanner/rules/contradictory_trust.py b/src/wardline/scanner/rules/contradictory_trust.py index f7b7a1f9..c085074c 100644 --- a/src/wardline/scanner/rules/contradictory_trust.py +++ b/src/wardline/scanner/rules/contradictory_trust.py @@ -30,10 +30,16 @@ if TYPE_CHECKING: from wardline.scanner.context import AnalysisContext -# The recognised trust-marker names (the grammar boundary types' canonical names). -# A custom grammar's markers are the agent's own concern; the builtin rule keys on -# the builtin vocabulary, which is the contract Wardline ships. +# The recognised trust-marker names (the grammar boundary types' canonical names) +# and the module prefixes they may be imported from. A custom grammar's markers are +# the agent's own concern; the builtin rule keys on the builtin vocabulary, which is +# the contract Wardline ships. Both names AND prefixes are derived from +# BUILTIN_BOUNDARY_TYPES so the rule cannot drift from the grammar — the prefix set +# is how ``wardline.decorators`` and the renamed ``weft_markers`` shim are BOTH +# recognised (wardline-d62845bb18: hardcoding only ``wardline.decorators`` silently +# missed contradictory stacks written against the recommended ``weft_markers`` shim). 
_MARKER_NAMES: frozenset[str] = frozenset(bt.canonical_name for bt in BUILTIN_BOUNDARY_TYPES) +_MARKER_MODULE_PREFIXES: frozenset[str] = frozenset(bt.module_prefix for bt in BUILTIN_BOUNDARY_TYPES) METADATA = RuleMetadata( rule_id="PY-WL-110", @@ -72,9 +78,7 @@ def _marker_canonical_name(deco: ast.expr, alias_map: Mapping[str, str]) -> str if fqn is None: return None last = fqn.rsplit(".", 1)[-1] - if last in {"external_boundary", "trust_boundary", "trusted"} and ( - fqn.startswith("wardline.decorators.") or fqn.startswith("wardline.decorators.trust.") - ): + if last in _MARKER_NAMES and any(fqn.startswith(prefix + ".") for prefix in _MARKER_MODULE_PREFIXES): return last return None diff --git a/src/wardline/scanner/taint/variable_level.py b/src/wardline/scanner/taint/variable_level.py index 2322f01d..25b8dbd2 100644 --- a/src/wardline/scanner/taint/variable_level.py +++ b/src/wardline/scanner/taint/variable_level.py @@ -1115,6 +1115,68 @@ def _taint_container_base( # ── Control flow handlers ──────────────────────────────────────── +def _branch_copy(parent: dict[str, ast.Lambda] | None) -> dict[str, ast.Lambda] | None: + """An arm-local copy of the lambda-bindings map for one branch arm (``None`` when + bindings are not being tracked — a degraded caller). Copying per arm is what keeps + a lambda bound inside one arm from leaking into a mutually-exclusive sibling arm + (wardline-36016d26f3), mirroring how ``var_taints`` is copied per arm.""" + return dict(parent) if parent is not None else None + + +def _walk_branch_body( + body: list[ast.stmt], + function_taint: TaintState, + taint_map: dict[str, TaintState], + var_taints: dict[str, TaintState], + call_site_taints: dict[int, dict[str, TaintState]] | None, + arm_bindings: dict[str, ast.Lambda] | None, +) -> None: + """Walk one branch arm's body with *arm_bindings* as the active (arm-local) + lambda-bindings map, so lambda assignments inside the arm mutate the copy, not the + shared parent. 
A plain ``_walk_body`` when bindings aren't tracked.""" + if arm_bindings is None: + _walk_body(body, function_taint, taint_map, var_taints, call_site_taints) + return + token = _CURRENT_LAMBDA_BINDINGS.set(arm_bindings) + try: + _walk_body(body, function_taint, taint_map, var_taints, call_site_taints) + finally: + _CURRENT_LAMBDA_BINDINGS.reset(token) + + +def _merge_branch_bindings( + parent: dict[str, ast.Lambda] | None, + arms: list[dict[str, ast.Lambda] | None], +) -> None: + """Merge mutually-exclusive branch arms' lambda bindings back into *parent* in + place. Each arm was walked against an arm-local *copy* of *parent*, so a binding + made in one arm cannot leak into a sibling arm during the walk + (wardline-36016d26f3); this re-converges the arms into the post-branch state. + + We layer each arm's *delta relative to the pre-branch state* onto *parent* in + source order — we do NOT clear and re-union. The distinction is load-bearing: an + arm is a full copy of the pre-branch bindings, so a name an arm never touched still + carries its pre-branch lambda. A clear-then-union (or a union that lets the implicit + no-``else`` / no-match-catch-all fall-through arm win last) would let such an + untouched arm *revert* a rebinding done in another arm — silently dropping a binding + the engine kept before branch-locality was added, i.e. a NEW false negative for a + sink reached through the rebound name after the branch. Applying only net + added/changed bindings, last-arm-in-source-order wins, reproduces the prior + after-branch bindings for every rebinding case (so no new false negative) while + keeping the branch-local leak fix. 
A name an arm *removed* (rebound to a non-lambda) + is left in place: that can only over-approximate (an extra resolution), never miss a + sink.""" + if parent is None: + return + pre = dict(parent) + for arm in arms: + if arm is None: + continue + for name, lam in arm.items(): + if pre.get(name) is not lam: + parent[name] = lam + + def _handle_if( stmt: ast.If, function_taint: TaintState, @@ -1128,18 +1190,25 @@ def _handle_if( # Snapshot before branches. pre_if = dict(var_taints) + parent_lambdas = _CURRENT_LAMBDA_BINDINGS.get() - # Walk the if-body. + # Walk the if-body with an arm-local lambda-bindings copy — branch-local like + # var_taints, so a lambda bound here cannot leak into the else arm. if_taints = dict(var_taints) - _walk_body(stmt.body, function_taint, taint_map, if_taints, call_site_taints) + if_lambdas = _branch_copy(parent_lambdas) + _walk_branch_body(stmt.body, function_taint, taint_map, if_taints, call_site_taints, if_lambdas) if stmt.orelse: - # Walk the else-body. + # Walk the else-body on its own arm-local bindings copy. else_taints = dict(var_taints) - _walk_body(stmt.orelse, function_taint, taint_map, else_taints, call_site_taints) + else_lambdas = _branch_copy(parent_lambdas) + _walk_branch_body(stmt.orelse, function_taint, taint_map, else_taints, call_site_taints, else_lambdas) else: - # No else — the "else" branch is the pre-if state. + # No else — the "else" branch is the pre-if state with bindings unchanged. else_taints = pre_if + else_lambdas = _branch_copy(parent_lambdas) + + _merge_branch_bindings(parent_lambdas, [if_lambdas, else_lambdas]) # Merge: for each variable, combine the two branch values. The var holds ONE # branch's value (an alternative), so combine via the rank-meet least_trusted @@ -1247,23 +1316,29 @@ def _handle_try( ) -> None: """Handle try/except/else/finally — snapshot-branch-join pattern.""" pre_try = dict(var_taints) + parent_lambdas = _CURRENT_LAMBDA_BINDINGS.get() - # Walk try body on a copy. 
+ # Walk try body on a copy (arm-local lambda bindings — branch-local like var_taints). try_taints = dict(pre_try) - _walk_body(stmt.body, function_taint, taint_map, try_taints, call_site_taints) + try_lambdas = _branch_copy(parent_lambdas) + _walk_branch_body(stmt.body, function_taint, taint_map, try_taints, call_site_taints, try_lambdas) # Walk each handler on separate copies (mutually exclusive with try body). handler_branches: list[dict[str, TaintState]] = [try_taints] # try-success is one branch + arm_bindings: list[dict[str, ast.Lambda] | None] = [try_lambdas] for handler in stmt.handlers: handler_taints = dict(pre_try) if handler.name: handler_taints[handler.name] = function_taint - _walk_body(handler.body, function_taint, taint_map, handler_taints, call_site_taints) + handler_lambdas = _branch_copy(parent_lambdas) + _walk_branch_body(handler.body, function_taint, taint_map, handler_taints, call_site_taints, handler_lambdas) handler_branches.append(handler_taints) + arm_bindings.append(handler_lambdas) - # Walk orelse on try-success branch (runs only if no exception). + # Walk orelse on the try-success branch (runs only if no exception) — continue the + # try arm's bindings, not a fresh copy. if stmt.orelse: - _walk_body(stmt.orelse, function_taint, taint_map, try_taints, call_site_taints) + _walk_branch_body(stmt.orelse, function_taint, taint_map, try_taints, call_site_taints, try_lambdas) # Merge all branches. all_vars: set[str] = set() @@ -1290,7 +1365,12 @@ def _handle_try( except KeyError: _taint_val = None # var absent from pre-try state — leave unset - # finalbody runs unconditionally after merge. + # Lambda bindings: union the mutually-exclusive arms (try-success + each handler) + # back into the parent, mirroring the var_taints join above. 
+ _merge_branch_bindings(parent_lambdas, arm_bindings) + + # finalbody runs unconditionally after merge — with the merged bindings (in place, + # the active contextvar dict, since the function body continues into it). if stmt.finalbody: _walk_body(stmt.finalbody, function_taint, taint_map, var_taints, call_site_taints) @@ -1317,20 +1397,32 @@ def _handle_match( subject_taint = _resolve_expr(stmt.subject, function_taint, taint_map, var_taints) pre_match = dict(var_taints) + parent_lambdas = _CURRENT_LAMBDA_BINDINGS.get() branches: list[dict[str, TaintState]] = [] + arm_bindings: list[dict[str, ast.Lambda] | None] = [] for case in stmt.cases: case_taints = dict(pre_match) for name in _collect_pattern_targets(case.pattern): case_taints[name] = subject_taint - if case.guard is not None: - # The guard is tested with the arm's captures in scope; resolve it for - # walrus side effects (binds into this arm's state). - _resolve_expr(case.guard, function_taint, taint_map, case_taints) - _walk_body(case.body, function_taint, taint_map, case_taints, call_site_taints) + # Arm-local lambda bindings (guard + body share the arm), branch-local like + # var_taints so a lambda bound in one case cannot leak into a sibling case. + case_lambdas = _branch_copy(parent_lambdas) + token = _CURRENT_LAMBDA_BINDINGS.set(case_lambdas) if case_lambdas is not None else None + try: + if case.guard is not None: + # The guard is tested with the arm's captures in scope; resolve it for + # walrus side effects (binds into this arm's state). + _resolve_expr(case.guard, function_taint, taint_map, case_taints) + _walk_body(case.body, function_taint, taint_map, case_taints, call_site_taints) + finally: + if token is not None: + _CURRENT_LAMBDA_BINDINGS.reset(token) branches.append(case_taints) + arm_bindings.append(case_lambdas) - # The implicit "no arm matched" path keeps the pre-match state. + # The implicit "no arm matched" path keeps the pre-match state and bindings. 
branches.append(pre_match) + arm_bindings.append(_branch_copy(parent_lambdas)) all_vars: set[str] = set() for branch in branches: @@ -1342,6 +1434,9 @@ def _handle_match( merged = combine(merged, v) var_taints[var] = merged + # Lambda bindings: union the mutually-exclusive case arms (+ no-match) into parent. + _merge_branch_bindings(parent_lambdas, arm_bindings) + # ── Helpers ────────────────────────────────────────────────────── diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/cli/test_scan_summary_vocab.py b/tests/cli/test_scan_summary_vocab.py new file mode 100644 index 00000000..91e6f2b4 --- /dev/null +++ b/tests/cli/test_scan_summary_vocab.py @@ -0,0 +1,75 @@ +"""Discipline tests pinning the finding-lifecycle vocabulary across surfaces. + +The canonical term for a non-suppressed DEFECT in the emitted findings is +"active" — used by the ``SuppressionState.ACTIVE`` enum, the ``ScanSummary.active`` +field, the MCP scan-response ``summary.active`` key, the agent-summary +``active_defects`` key, and the ``wardline:loop`` prompt. These tests pin the CLI +human summary line to the same word so an agent never has to reconcile a CLI +"N new" against an MCP "active". + +See ``docs/reference/finding-lifecycle-vocabulary.md``. 
+""" + +from __future__ import annotations + +import re +from pathlib import Path + +from click.testing import CliRunner + +from wardline.cli.main import cli +from wardline.core.agent_summary import build_agent_summary +from wardline.core.run import gate_decision, run_scan + +_ONE_ACTIVE_DEFECT = """from wardline.decorators import trust_boundary, external_boundary + +@external_boundary +def read_raw(p): + return p + +@trust_boundary(to_level='ASSURED') +def v(p): + return read_raw(p) +""" + + +def _write_fixture(tmp_path: Path) -> Path: + (tmp_path / "m.py").write_text(_ONE_ACTIVE_DEFECT, encoding="utf-8") + return tmp_path + + +def test_scan_summary_uses_active_not_new(tmp_path: Path) -> None: + _write_fixture(tmp_path) + res = CliRunner().invoke(cli, ["scan", str(tmp_path)]) + assert res.exit_code == 0, res.output + out = res.output + # The non-suppressed count is labelled "active", never "new". + assert re.search(r"\d+ active", out), out + assert not re.search(r"\d+ new\b", out), out + + +def test_active_term_consistent_across_surfaces(tmp_path: Path) -> None: + _write_fixture(tmp_path) + + result = run_scan(tmp_path) + decision = gate_decision(result, None) + n_active = result.summary.active + assert n_active == 1 + + # agent-summary: ``active_defects`` (descriptive-suffix convention) equals the count. + agent = build_agent_summary(result, decision).to_dict() + assert agent["summary"]["active_defects"] == n_active + + # MCP scan response: the summary key is "active" (and never "new"). + from wardline.mcp import server + + mcp_summary = server._scan({"path": "."}, tmp_path)["summary"] + assert mcp_summary["active"] == n_active + assert "new" not in mcp_summary + + # CLI human line: the count printed for "active" matches. 
+ res = CliRunner().invoke(cli, ["scan", str(tmp_path)]) + assert res.exit_code == 0, res.output + m = re.search(r"(\d+) active", res.output) + assert m is not None, res.output + assert int(m.group(1)) == n_active diff --git a/tests/docs/__init__.py b/tests/docs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/docs/test_glossary_vocabulary.py b/tests/docs/test_glossary_vocabulary.py new file mode 100644 index 00000000..8288434c --- /dev/null +++ b/tests/docs/test_glossary_vocabulary.py @@ -0,0 +1,29 @@ +"""Doc-discipline tests for the finding-lifecycle vocabulary glossary. + +The glossary at ``docs/reference/finding-lifecycle-vocabulary.md`` is the single +source of truth for the finding-state / gate-population vocabulary. These tests +keep it complete (every ``SuppressionState`` value documented) and wired into the +mkdocs nav (so ``mkdocs build --strict`` does not orphan it). +""" + +from __future__ import annotations + +from pathlib import Path + +from wardline.core.finding import SuppressionState + +_REPO = Path(__file__).parents[2] +_GLOSSARY = _REPO / "docs" / "reference" / "finding-lifecycle-vocabulary.md" +_MKDOCS = _REPO / "mkdocs.yml" +_NAV_PATH = "reference/finding-lifecycle-vocabulary.md" + + +def test_glossary_defines_every_suppression_state() -> None: + text = _GLOSSARY.read_text(encoding="utf-8") + for state in SuppressionState: + assert state.value in text, f"glossary is missing SuppressionState '{state.value}'" + + +def test_glossary_in_nav() -> None: + nav = _MKDOCS.read_text(encoding="utf-8") + assert _NAV_PATH in nav, f"{_NAV_PATH} is not wired into the mkdocs nav" diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index 9e76b7a1..a36f4a4e 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -1227,8 +1227,8 @@ def v(p): # Run scan with --fix and --yes res = CliRunner().invoke(cli, ["scan", str(tmp_path), "--fix", "--yes"]) assert res.exit_code == 0, res.output - # The scan 
output should show that the findings were fixed, and the re-run has 0 new defects - assert "0 new" in res.output + # The scan output should show that the findings were fixed, and the re-run has 0 active defects + assert "0 active" in res.output assert "raise ValueError" in m_py.read_text(encoding="utf-8") diff --git a/tests/unit/scanner/rules/test_contradictory_trust.py b/tests/unit/scanner/rules/test_contradictory_trust.py index b3c0e83a..6de42e71 100644 --- a/tests/unit/scanner/rules/test_contradictory_trust.py +++ b/tests/unit/scanner/rules/test_contradictory_trust.py @@ -120,3 +120,37 @@ def f(p): ) findings = _run(ctx) assert [(f.rule_id, f.qualname) for f in findings] == [("PY-WL-110", "m.f")] + + +def test_weft_markers_namespace_fires(tmp_path) -> None: + # wardline-d62845bb18: a contradictory stack imported from the renamed + # `weft_markers` shim must fire identically to `wardline.decorators` — it is + # a recognised boundary namespace in the builtin grammar (BUILTIN_BOUNDARY_TYPES). + ctx = _analyze( + tmp_path, + """ + from weft_markers import external_boundary, trusted + @trusted + @external_boundary + def conflicting(p): + return p + """, + ) + findings = _run(ctx) + assert [(f.rule_id, f.qualname) for f in findings] == [("PY-WL-110", "m.conflicting")] + + +def test_weft_markers_call_form_fires(tmp_path) -> None: + # The called form (@trusted(level=...) + @external_boundary) over weft_markers. 
+ ctx = _analyze( + tmp_path, + """ + from weft_markers import external_boundary, trusted + @trusted(level='ASSURED') + @external_boundary + def conflicting(p): + return p + """, + ) + findings = _run(ctx) + assert [(f.rule_id, f.qualname) for f in findings] == [("PY-WL-110", "m.conflicting")] diff --git a/tests/unit/scanner/rules/test_sink_rules.py b/tests/unit/scanner/rules/test_sink_rules.py index 0ac2893d..b26dc0a9 100644 --- a/tests/unit/scanner/rules/test_sink_rules.py +++ b/tests/unit/scanner/rules/test_sink_rules.py @@ -365,6 +365,71 @@ def f(p): assert [(x.rule_id, x.qualname) for x in findings] == [("PY-WL-107", "m.f")] +def test_107_lambda_bound_in_sibling_if_arm_does_not_leak(tmp_path) -> None: + # Branch-locality regression (wardline-36016d26f3): a lambda bound in the + # if-arm must NOT leak into the mutually-exclusive else-arm. The else-arm + # calls ``cb(raw)`` — if the if-arm binding leaks, the at-call path binds + # x=raw into eval(x) and over-fires (false positive). Branch-local bindings + # mean the else-arm has no binding for ``cb``, so only the worst-ever second + # pass records eval(x) with x reset to function_taint (ASSURED) → clean. + ctx = _analyze( + tmp_path, + """ + @trusted(level='ASSURED') + def f(p, cond): + raw = read_raw(p) + if cond: + cb = lambda x: eval(x) + else: + cb(raw) + """, + ) + assert UntrustedToExec().check(ctx) == [] + + +def test_107_lambda_bound_in_try_arm_does_not_leak_to_handler(tmp_path) -> None: + # Branch-locality regression (wardline-36016d26f3) for try/except: a lambda + # bound in the try body must NOT leak into the except handler (a mutually + # exclusive arm). The handler calls ``cb(raw)``; a leaked binding would bind + # x=raw into eval(x) and over-fire. Branch-local try/handler bindings mean + # the handler has no binding for ``cb``. 
+ ctx = _analyze( + tmp_path, + """ + @trusted(level='ASSURED') + def f(p): + raw = read_raw(p) + try: + cb = lambda x: eval(x) + except Exception: + cb(raw) + """, + ) + assert UntrustedToExec().check(ctx) == [] + + +def test_107_lambda_bound_in_match_arm_does_not_leak_to_sibling(tmp_path) -> None: + # Branch-locality regression (wardline-36016d26f3) for match/case: a lambda + # bound in the first case-arm must NOT leak into a sibling case-arm. The + # second arm calls ``cb(raw)``; a leaked binding would bind x=raw into + # eval(x) and over-fire. Branch-local case bindings mean the second arm has + # no binding for ``cb``. + ctx = _analyze( + tmp_path, + """ + @trusted(level='ASSURED') + def f(p, kind): + raw = read_raw(p) + match kind: + case "a": + cb = lambda x: eval(x) + case _: + cb(raw) + """, + ) + assert UntrustedToExec().check(ctx) == [] + + def test_108_raw_reaches_os_system_in_lambda_body(tmp_path) -> None: # The engine fix is sink-agnostic (shared _resolve_expr / worst_arg_taint): a # command sink in a lambda body fires flow-sensitively on real taint too. diff --git a/tests/unit/scanner/taint/test_variable_level.py b/tests/unit/scanner/taint/test_variable_level.py index 91e02cf2..ecc7c013 100644 --- a/tests/unit/scanner/taint/test_variable_level.py +++ b/tests/unit/scanner/taint/test_variable_level.py @@ -227,6 +227,126 @@ def test_walrus_inside_lambda_does_not_leak_to_enclosing_scope() -> None: assert "z" not in out +def _lambda_body_sink_arg(src: str) -> TaintState: + """Run the variable-taint pass over *src* (a function with a ``sink(c)`` call inside + a lambda body bound in one branch arm and a tainted ``cb(raw)`` call in a sibling + arm) and return the taint recorded for the lambda body's ``sink(c)`` argument. 
+ + Used by the wardline-36016d26f3 branch-locality regression tests: if the lambda + binding leaks into the sibling arm, the else/handler/case arm's ``raw`` reaches the + lambda body and the recorded arg becomes EXTERNAL_RAW (the over-fire). Branch-local, + it stays INTEGRAL (if-arm direct call + the floor pass, both neutral).""" + func = ast.parse(src).body[0] + assert isinstance(func, ast.FunctionDef) + csat: dict[int, dict[int | str | None, TaintState]] = {} + compute_variable_taints( + func, + T.INTEGRAL, + {}, + call_site_taints={}, + alias_map={}, + call_site_arg_taints=csat, + param_meets={"raw": T.EXTERNAL_RAW}, + ) + sink_call = next( + n for n in ast.walk(func) if isinstance(n, ast.Call) and isinstance(n.func, ast.Name) and n.func.id == "sink" + ) + return csat[id(sink_call)][0] + + +def test_lambda_binding_is_branch_local_across_if_else() -> None: + # wardline-36016d26f3: a lambda bound in the if-arm must NOT leak into the sibling + # else-arm. _CURRENT_LAMBDA_BINDINGS was shared across branches (unlike var_taints, + # which is copied per arm), so `cb(raw)` in the else-arm — where `cb` is NOT bound + # to the lambda — spuriously resolved the if-arm's lambda body against the raw arg, + # over-tainting the body's inner `sink(c)` call. Over-fire only (the final + # worst-ever pass is the soundness floor), but a real false positive in adversarial + # branch layouts. + src = ( + "def handler(raw):\n" + " if flag:\n" + " cb = lambda c: sink(c)\n" + " cb('safe')\n" + " else:\n" + " cb(raw)\n" + ) + assert _lambda_body_sink_arg(src) == T.INTEGRAL + + +def test_lambda_binding_is_branch_local_across_try_except() -> None: + # Same leak class for mutually-exclusive try-success vs except-handler arms. 
+ src = ( + "def handler(raw):\n" + " try:\n" + " cb = lambda c: sink(c)\n" + " cb('safe')\n" + " except Exception:\n" + " cb(raw)\n" + ) + assert _lambda_body_sink_arg(src) == T.INTEGRAL + + +def test_lambda_binding_is_branch_local_across_match() -> None: + # Same leak class for mutually-exclusive match case arms. + src = ( + "def handler(raw, kind):\n" + " match kind:\n" + " case 'a':\n" + " cb = lambda c: sink(c)\n" + " cb('safe')\n" + " case _:\n" + " cb(raw)\n" + ) + assert _lambda_body_sink_arg(src) == T.INTEGRAL + + +def test_lambda_rebinding_survives_no_else_if_for_post_branch_call() -> None: + # wardline-36016d26f3 (merge-OUT direction, no-false-negative guard): branch-local + # bindings must still re-converge so a rebinding made inside a no-`else` ``if`` + # survives for a call AFTER the branch. ``cb`` is bound to a safe lambda, rebound to + # the sink lambda in the if-arm, then ``cb(raw)`` runs after the branch — ``cb`` MAY + # be the sink lambda, so the body's ``sink(c)`` arg must stay EXTERNAL_RAW + # (conservative). A clear-then-union merge that let the implicit (no-else) + # fall-through arm win last reverted ``cb`` to the safe lambda and dropped the + # detection — a false negative the pre-branch-local engine did not have. + src = ( + "def handler(raw):\n" + " cb = lambda c: c\n" + " if flag:\n" + " cb = lambda c: sink(c)\n" + " cb(raw)\n" + ) + assert _lambda_body_sink_arg(src) == T.EXTERNAL_RAW + + +def test_lambda_rebinding_survives_match_without_catch_all_for_post_branch_call() -> None: + # Same merge-out no-false-negative guard for a ``match`` with no catch-all case: the + # synthetic no-match fall-through arm must not revert a case-arm rebinding. 
+ src = ( + "def handler(raw, kind):\n" + " cb = lambda c: c\n" + " match kind:\n" + " case 'a':\n" + " cb = lambda c: sink(c)\n" + " cb(raw)\n" + ) + assert _lambda_body_sink_arg(src) == T.EXTERNAL_RAW + + +def test_lambda_rebinding_in_try_survives_into_finalbody() -> None: + # Same merge-out guard for try/finally: a rebinding in the try body must survive the + # branch join into the unconditionally-executed finalbody. + src = ( + "def handler(raw):\n" + " cb = lambda c: c\n" + " try:\n" + " cb = lambda c: sink(c)\n" + " finally:\n" + " cb(raw)\n" + ) + assert _lambda_body_sink_arg(src) == T.EXTERNAL_RAW + + def test_compute_return_taint_all_shapes() -> None: import ast import textwrap From 50b27a0acb5e55f21a79f9594d88bf3d98ec8554 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 22:29:54 +1000 Subject: [PATCH 12/17] =?UTF-8?q?fix(dogfood):=20PR=20#30=20review=20harde?= =?UTF-8?q?ning=20=E2=80=94=20gate=20invariant,=20403/5xx=20split,=20MCP?= =?UTF-8?q?=20legis=20reason,=20strict=20arg=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Applies the PR #30 multi-reviewer findings (code/tests/errors/comments/types): - GateDecision.__post_init__ makes "tripped gate that reads as passed" (dogfood #2) unconstructible, not merely avoided by the factory. - Filigree 403 is now distinguished from 401 across all three render sites (CLI stderr, CLI disabled_reason, MCP) — "forbidden (token lacks access)" rather than the misleading "set WARDLINE_FILIGREE_TOKEN". - MCP dirty-unsigned legis artifact carries a loud `reason` (parity with the CLI "never gate CI on it" warning) — agent-first surfaces stay equally loud. - migration_hint threaded into the agent-summary gate block so the "see gate.migration_hint" pointer in next_actions resolves on that surface too. 
- Strict boolean validation for summary_only/include_suppressed/allow_dirty/ explain (reject non-bool rather than silently coercing "false"→True) + max_findings JSON schema gains `minimum: 0`. - CHANGELOG: payload-controls entry corrected to dogfood #4 (verified against the friction report: #4=payload, #5=auth); genuine-trip reason quoted verbatim. - Glossary file:line anchors tightened to the WAIVED/JUDGED assignment lines. Quality consolidation (behavior-preserving): shared severity_gates() and filigree_disabled_reason() helpers, enum-identity (`is`) unified. New tests pin 5xx rendering (CLI+MCP), the MCP legis dirty/signed projection, the mixed active+suppressed gate-reason branch, the GateDecision invariant guard, strict arg validation, and the agent-summary migration_hint. Suite 2515 passed; ruff/mypy clean; mkdocs --strict builds. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 18 ++-- .../reference/finding-lifecycle-vocabulary.md | 4 +- src/wardline/cli/scan.py | 39 ++++--- src/wardline/core/agent_summary.py | 7 ++ src/wardline/core/filigree_emit.py | 20 ++++ src/wardline/core/run.py | 29 +++-- src/wardline/core/suppression.py | 11 +- src/wardline/mcp/server.py | 51 ++++++--- tests/unit/cli/test_cli.py | 46 ++++++++ tests/unit/core/test_agent_summary.py | 22 ++++ tests/unit/core/test_run.py | 38 +++++++ tests/unit/mcp/test_server_filigree_emit.py | 26 +++++ tests/unit/mcp/test_server_legis_artifact.py | 102 ++++++++++++++++++ tests/unit/mcp/test_server_query_explain.py | 18 ++++ 14 files changed, 375 insertions(+), 56 deletions(-) create mode 100644 tests/unit/mcp/test_server_legis_artifact.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 300edf4f..ff80ab1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - **MCP `scan` payload controls — `where` now shrinks the payload, plus `summary_only` / `max_findings` / `include_suppressed` and a 
default explain cap - (dogfood friction #5).** `where` previously filtered only the top-level `findings` + (dogfood friction #4).** `where` previously filtered only the top-level `findings` list; the `agent_summary` arrays still inlined every suppressed finding, so a filter matching zero findings still returned dozens. `where` now filters the `agent_summary` arrays too. New args: `summary_only: true` (counts + gate, no finding bodies — the @@ -28,8 +28,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 gate block (CLI stderr, MCP `scan` result, and the agent-summary) carries a human `reason` — e.g. `"34 suppressed ERROR+ defect(s) (baseline/waiver/judged) not cleared; pass --trust-suppressions (trusted checkout) or --new-since (PR)"` for a - suppressed-only trip, `"N active ERROR+ defect(s)"` for a genuine one (no misdirection - to the suppression flags) — and an `evaluated` string naming the judged population + suppressed-only trip, `"N active ERROR+ defect(s) at or above ERROR"` for a genuine one + (no misdirection to the suppression flags) — and an `evaluated` string naming the judged population (`unsuppressed …` by default vs `post-suppression … honored` under `--trust-suppressions`). Counts come from the annotated findings, so they match `summary`. @@ -37,9 +37,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 When a committed `.wardline/baseline.yaml` exists, the gate trips **solely** because baselined defects re-enter the unsuppressed population, and neither `--trust-suppressions` nor `--new-since` was passed, Wardline now prints a one-line - `migration:` hint (CLI stderr; MCP `scan` `gate.migration_hint`) pointing at the - escape hatches and the new **`UPGRADING.md`**. This is the "my repo went red with no - code change" case made self-explaining; the secure default itself is unchanged. 
+ `migration:` hint (CLI stderr; MCP `scan` `gate.migration_hint`; and the agent-summary + `gate.migration_hint`) pointing at the escape hatches and the new **`UPGRADING.md`**. + This is the "my repo went red with no code change" case made self-explaining; the + secure default itself is unchanged. ### Fixed - **`next_actions` is gate-aware — never reads as "passed" when the gate failed @@ -55,8 +56,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `status` + `auth_rejected`; the CLI prints *"Filigree returned 401 (auth rejected) … set WARDLINE_FILIGREE_TOKEN"* (and a distinct `5xx` "server error" vs the genuine "could not reach"), and the MCP `scan` `filigree_emit` block / agent-summary carry the - same discriminated `disabled_reason`. `401`/`403` stays **soft** (non-load-bearing, - never exit-2) — only the message changed. + same discriminated `disabled_reason`. A `403` is reported as *"forbidden (token present + but lacks access)"* rather than telling the agent to set a token that won't help. + `401`/`403` stays **soft** (non-load-bearing, never exit-2) — only the message changed. - **`scan --format legis --allow-dirty` emits an unsigned dev artifact instead of refusing (dogfood friction #1).** On a dirty working tree `scan --format legis` failed `exit 2` naming an `allow_dirty` flag that was never exposed — presenting diff --git a/docs/reference/finding-lifecycle-vocabulary.md b/docs/reference/finding-lifecycle-vocabulary.md index dc8922ca..e2544416 100644 --- a/docs/reference/finding-lifecycle-vocabulary.md +++ b/docs/reference/finding-lifecycle-vocabulary.md @@ -37,8 +37,8 @@ values. Every emitted `DEFECT` carries exactly one: | --- | --- | --- | | `active` | Not suppressed — the default. A live defect. | default (`src/wardline/core/finding.py:68`, `src/wardline/core/finding.py:103`) | | `baselined` | Matched a fingerprint in `.wardline/baseline.yaml`. 
| `src/wardline/core/suppression.py:70` | -| `waived` | Matched an unexpired waiver in `wardline.yaml`. | `src/wardline/core/suppression.py:65-66` | -| `judged` | The LLM triage judge ruled it a false positive (`.wardline/judged.yaml`). | `src/wardline/core/suppression.py:67-68` | +| `waived` | Matched an unexpired waiver in `wardline.yaml`. | `src/wardline/core/suppression.py:66` | +| `judged` | The LLM triage judge ruled it a false positive (`.wardline/judged.yaml`). | `src/wardline/core/suppression.py:68` | When more than one layer matches a finding, **precedence is waiver > judged > baseline** — explicit human intent wins, then the LLM verdict diff --git a/src/wardline/cli/scan.py b/src/wardline/cli/scan.py index d51e01eb..e2c6be69 100644 --- a/src/wardline/cli/scan.py +++ b/src/wardline/cli/scan.py @@ -11,7 +11,7 @@ from wardline.core.config import resolve_filigree_url, resolve_loomweave_url from wardline.core.emit import JsonlSink from wardline.core.errors import WardlineError -from wardline.core.filigree_emit import EmitResult, FiligreeEmitter +from wardline.core.filigree_emit import EmitResult, FiligreeEmitter, filigree_disabled_reason from wardline.core.finding import Severity from wardline.core.run import baseline_migration_hint, gate_decision, run_scan from wardline.core.sarif import SarifSink @@ -290,6 +290,7 @@ def confirm_cb(rel_path: str, orig: str, replacement: str, f: Finding) -> bool: decision, filigree_emit=_filigree_status(emit_result), loomweave_write=_loomweave_status(loomweave_result), + migration_hint=baseline_migration_hint(result, decision, root=path, new_since=new_since), ).to_dict(), sort_keys=True, ) @@ -303,11 +304,21 @@ def confirm_cb(rel_path: str, orig: str, replacement: str, f: Finding) -> bool: if not emit_result.reachable: if emit_result.auth_rejected: # Reachable but refused — actionable, NOT "could not reach" (dogfood #5). 
- click.echo( - f"warning: Filigree returned {emit_result.status} (auth rejected) at {filigree_url}; " - "set WARDLINE_FILIGREE_TOKEN (or .env) to the project token. Findings written locally only.", - err=True, - ) + # Split 401 (no/bad token → set one) from 403 (token present but lacks + # access / blocked → setting a token won't help) so the remedy fits. + if emit_result.status == 403: + click.echo( + f"warning: Filigree returned 403 (forbidden) at {filigree_url}; the token is " + "present but lacks access (scope/permission) or the request is blocked. " + "Findings written locally only.", + err=True, + ) + else: + click.echo( + f"warning: Filigree returned {emit_result.status} (auth rejected) at {filigree_url}; " + "set WARDLINE_FILIGREE_TOKEN (or .env) to the project token. Findings written locally only.", + err=True, + ) elif emit_result.status is not None: click.echo( f"warning: Filigree returned {emit_result.status} (server error) at {filigree_url}; " @@ -397,20 +408,14 @@ def _filigree_status(result: EmitResult | None) -> dict[str, object]: "updated": result.updated, "failed": result.failed, "warnings": list(result.warnings), - "disabled_reason": _filigree_unreachable_reason(result), + "disabled_reason": filigree_disabled_reason( + reachable=result.reachable, + auth_rejected=result.auth_rejected, + status=result.status, + ), } -def _filigree_unreachable_reason(result: EmitResult) -> str | None: - if result.reachable: - return None - if result.auth_rejected: - return f"filigree auth-rejected ({result.status}); set WARDLINE_FILIGREE_TOKEN" - if result.status is not None: - return f"filigree server error ({result.status})" - return "filigree unreachable" - - def _loomweave_status(result: object | None) -> dict[str, object]: if result is None: return { diff --git a/src/wardline/core/agent_summary.py b/src/wardline/core/agent_summary.py index 984b86dd..778d0721 100644 --- a/src/wardline/core/agent_summary.py +++ b/src/wardline/core/agent_summary.py @@ -54,6 
+54,10 @@ class AgentSummary: summary_only: bool = False max_findings: int | None = None include_suppressed: bool = True + # The secure-gate-default rollout hint (or None), surfaced in the gate block so the + # "see gate.migration_hint" pointer in next_actions resolves on this surface too — the + # MCP scan response carries the same value at its top-level gate block. + migration_hint: str | None = None def to_dict(self) -> dict[str, Any]: # Counts are whole-project (summary describes the whole project, per the `where` @@ -97,6 +101,7 @@ def to_dict(self) -> dict[str, Any]: "exit_class": self.gate.exit_class, "reason": self.gate.reason, "evaluated": self.gate.evaluated, + "migration_hint": self.migration_hint, }, "integrations": { "filigree_emit": dict(self.filigree_emit), @@ -213,6 +218,7 @@ def build_agent_summary( summary_only: bool = False, max_findings: int | None = None, include_suppressed: bool = True, + migration_hint: str | None = None, ) -> AgentSummary: return AgentSummary( result=result, @@ -223,4 +229,5 @@ def build_agent_summary( summary_only=summary_only, max_findings=max_findings, include_suppressed=include_suppressed, + migration_hint=migration_hint, ) diff --git a/src/wardline/core/filigree_emit.py b/src/wardline/core/filigree_emit.py index caff60d8..b11b9e95 100644 --- a/src/wardline/core/filigree_emit.py +++ b/src/wardline/core/filigree_emit.py @@ -107,6 +107,26 @@ class EmitResult: auth_rejected: bool = False +def filigree_disabled_reason(*, reachable: bool, auth_rejected: bool, status: int | None) -> str | None: + """The ``disabled_reason`` for an emit attempt, or None when Filigree was reached. + + Single source of the auth-rejected (401/403) vs server-error (5xx) vs unreachable + (transport failure) ladder (dogfood #5), shared by the CLI and MCP status blocks so + the two surfaces can never drift. The CLI's human stderr wording (which embeds the + URL and ".env" hint) is intentionally separate. 
+ """ + if reachable: + return None + if auth_rejected: + # 401 → set a token; 403 → token present but lacks access (a token won't help). + if status == 403: + return "filigree forbidden (403); token present but lacks access / blocked" + return f"filigree auth-rejected ({status}); set WARDLINE_FILIGREE_TOKEN" + if status is not None: + return f"filigree server error ({status})" + return "filigree unreachable" + + class Transport(Protocol): def post(self, url: str, body: bytes, headers: Mapping[str, str]) -> Response: ... diff --git a/src/wardline/core/run.py b/src/wardline/core/run.py index dbd12c46..a7356c7e 100644 --- a/src/wardline/core/run.py +++ b/src/wardline/core/run.py @@ -30,7 +30,7 @@ ) from wardline.core.judged import load_judged from wardline.core.protocols import Analyzer -from wardline.core.suppression import apply_suppressions, gate_trips +from wardline.core.suppression import apply_suppressions, gate_trips, severity_gates from wardline.core.waivers import WaiverSet, parse_waivers if TYPE_CHECKING: @@ -91,6 +91,22 @@ class GateDecision: reason: str | None = None evaluated: str | None = None + def __post_init__(self) -> None: + # Enforce the invariants the ``gate_decision`` factory upholds so a *second* + # constructor cannot reintroduce dogfood #2 (a tripped gate that reads as passed). + # exit_class mirrors tripped (0/1); the reserved 2 is a CLI SystemExit, never a + # GateDecision value. + if self.exit_class != (1 if self.tripped else 0): + raise ValueError(f"exit_class {self.exit_class} contradicts tripped={self.tripped}") + # A tripped gate must always carry its verdict — never silently None. + if self.tripped and self.reason is None: + raise ValueError("a tripped gate must carry a reason") + # No threshold (fail_on None) ⟺ no verdict; a threshold always produces both. 
+ if (self.fail_on is None) != (self.reason is None): + raise ValueError("reason must be present iff fail_on is set") + if (self.fail_on is None) != (self.evaluated is None): + raise ValueError("evaluated must be present iff fail_on is set") + def run_scan( root: Path, @@ -343,7 +359,7 @@ def baseline_migration_hint( if f.kind is Kind.DEFECT and f.suppressed is SuppressionState.BASELINED and f.maturity is not Maturity.PREVIEW - and _gates(f.severity, fail_on) + and severity_gates(f.severity, fail_on) ) if not baselined: return None # tripped by waived/judged only — different escape, not this hint @@ -355,13 +371,6 @@ def baseline_migration_hint( ) -def _gates(severity: Severity, fail_on: Severity) -> bool: - from wardline.core.suppression import _RANK - - rank = _RANK.get(severity) - return rank is not None and rank >= _RANK[fail_on] - - def _gate_reason(result: ScanResult, fail_on: Severity, *, tripped: bool, honors_suppressions: bool) -> str: """The human verdict string, counted over the ACTUAL gate population so the numbers are exactly what tripped it.""" @@ -388,7 +397,7 @@ def _gate_reason(result: ScanResult, fail_on: Severity, *, tripped: bool, honors for f in gate_pop: if f.kind is not Kind.DEFECT or f.maturity is Maturity.PREVIEW: continue - if f.suppressed is not SuppressionState.ACTIVE or not _gates(f.severity, fail_on): + if f.suppressed is not SuppressionState.ACTIVE or not severity_gates(f.severity, fail_on): continue if emitted_state.get(f.fingerprint, SuppressionState.ACTIVE) is SuppressionState.ACTIVE: active += 1 diff --git a/src/wardline/core/suppression.py b/src/wardline/core/suppression.py index 197e7b75..c3071617 100644 --- a/src/wardline/core/suppression.py +++ b/src/wardline/core/suppression.py @@ -73,13 +73,20 @@ def apply_suppressions( return out +def severity_gates(severity: Severity, fail_on: Severity) -> bool: + """True iff ``severity`` is a known gate severity at or above the ``fail_on`` + threshold. 
NONE (facts/metrics, absent from ``_RANK``) never gates.""" + rank = _RANK.get(severity) + return rank is not None and rank >= _RANK[fail_on] + + def gate_trips(findings: Iterable[Finding], fail_on: Severity) -> bool: """True iff any ACTIVE Kind.DEFECT finding has severity >= fail_on.""" threshold = _RANK[fail_on] for f in findings: if f.kind is not Kind.DEFECT or f.suppressed is not SuppressionState.ACTIVE: continue - if f.maturity == Maturity.PREVIEW: + if f.maturity is Maturity.PREVIEW: continue rank = _RANK.get(f.severity) if rank is not None and rank >= threshold: @@ -102,7 +109,7 @@ def gate_breakdown(findings: Iterable[Finding], fail_on: Severity) -> tuple[int, active = 0 suppressed = 0 for f in findings: - if f.kind is not Kind.DEFECT or f.maturity == Maturity.PREVIEW: + if f.kind is not Kind.DEFECT or f.maturity is Maturity.PREVIEW: continue rank = _RANK.get(f.severity) if rank is None or rank < threshold: diff --git a/src/wardline/mcp/server.py b/src/wardline/mcp/server.py index 14434232..f29ffd12 100644 --- a/src/wardline/mcp/server.py +++ b/src/wardline/mcp/server.py @@ -22,7 +22,7 @@ from wardline.core.baseline import generate_baseline, load_baseline from wardline.core.errors import WardlineError from wardline.core.explain import explain_chain, explain_finding, explanation_from_context -from wardline.core.filigree_emit import FiligreeEmitter +from wardline.core.filigree_emit import FiligreeEmitter, filigree_disabled_reason from wardline.core.finding import Finding, Kind, Severity, SuppressionState from wardline.core.finding_query import filter_findings from wardline.core.judge_run import run_judge @@ -77,15 +77,11 @@ def _filigree_emit_status(block: dict[str, Any] | None) -> dict[str, Any]: "warnings": [], "disabled_reason": "not configured", } - reachable = block.get("reachable") - if reachable: - disabled_reason = None - elif block.get("auth_rejected"): - disabled_reason = f"filigree auth-rejected ({block.get('status')}); set WARDLINE_FILIGREE_TOKEN" 
- elif block.get("status") is not None: - disabled_reason = f"filigree server error ({block.get('status')})" - else: - disabled_reason = "filigree unreachable" + disabled_reason = filigree_disabled_reason( + reachable=bool(block.get("reachable")), + auth_rejected=bool(block.get("auth_rejected")), + status=block.get("status"), + ) return {"configured": True, "disabled_reason": disabled_reason, **block} @@ -182,6 +178,18 @@ def _cache_dir_arg(args: dict[str, Any], root: Path) -> Path | None: return _resolve_under_root(root, args["cache_dir"]) if args.get("cache_dir") else None +def _bool_arg(args: dict[str, Any], name: str, default: bool) -> bool: + # Reject non-bool values loudly rather than ``bool(...)``-coercing them: a JSON string + # like "false" would otherwise coerce to True, silently inverting intent. Matches the + # strict (agent-actionable) validation max_findings already gets. + val = args.get(name) + if val is None: + return default + if not isinstance(val, bool): + raise ToolError(f"{name} must be a boolean") + return val + + def _scan( args: dict[str, Any], root: Path, @@ -250,15 +258,14 @@ def _scan( # Payload-shrinking controls (dogfood #4). The `summary`/`gate` blocks always # describe the WHOLE project; these only bound the returned finding bodies. - summary_only = bool(args.get("summary_only") or False) - raw_include = args.get("include_suppressed") - include_suppressed = True if raw_include is None else bool(raw_include) + summary_only = _bool_arg(args, "summary_only", False) + include_suppressed = _bool_arg(args, "include_suppressed", True) max_findings = args.get("max_findings") if max_findings is not None and ( not isinstance(max_findings, int) or isinstance(max_findings, bool) or max_findings < 0 ): raise ToolError("max_findings must be a non-negative integer") - explain = bool(args.get("explain")) + explain = _bool_arg(args, "explain", False) # include_suppressed:false drops the suppressed DEFECT bodies (counts stay whole). 
if not include_suppressed: @@ -344,6 +351,7 @@ def _scan( summary_only=summary_only, max_findings=max_findings, include_suppressed=include_suppressed, + migration_hint=migration_hint, ).to_dict(), } _attach_legis_artifact( @@ -394,7 +402,7 @@ def _attach_legis_artifact( strict_defaults=strict_defaults, ) key_bytes = key_str.encode("utf-8") if key_str else None - allow_dirty = bool(args.get("allow_dirty") or False) + allow_dirty = _bool_arg(args, "allow_dirty", False) status: dict[str, Any] = { "configured": True, "signed": False, @@ -413,6 +421,14 @@ def _attach_legis_artifact( dirty = bool(artifact.get("dirty")) status["signed"] = key_bytes is not None and not dirty status["dirty"] = dirty + if dirty: + # Match the CLI's loudness on the agent surface: the artifact is UNSIGNED and legis + # records it unverified — say so and say "never gate CI on it" rather than leaving + # the agent to infer it from signed:false / dirty:true alone (agent-first). + status["reason"] = ( + "dirty working tree — emitted an UNSIGNED legis dev artifact (legis records it " + "unverified); never gate CI on it. Commit for a signed artifact." + ) response["legis_artifact"] = artifact response["legis_artifact_status"] = status @@ -861,9 +877,10 @@ def _register_tools(self) -> None: }, "max_findings": { "type": "integer", + "minimum": 0, "description": "Cap the number of returned finding bodies (and inlined " - "explanations). The cut is reported in the truncation block; summary counts " - "stay whole-project.", + "explanations). Must be a non-negative integer. 
The cut is reported in the " + "truncation block; summary counts stay whole-project.", }, "include_suppressed": { "type": "boolean", diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index a36f4a4e..d93cf20b 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -799,6 +799,52 @@ def emit(self, findings, *, scanned_paths=()): assert "wardline_filigree_token" in low +def _emitter_returning(status, *, auth_rejected): + """A FiligreeEmitter stand-in that always returns a canned soft EmitResult.""" + + class _E: + def __init__(self, url, **kw): + pass + + def emit(self, findings, *, scanned_paths=()): + from wardline.core.filigree_emit import EmitResult + + return EmitResult(reachable=False, status=status, auth_rejected=auth_rejected) + + return _E + + +def test_scan_filigree_403_says_forbidden_not_set_a_token(tmp_path, monkeypatch) -> None: + # A 403 is reachable-but-refused like a 401, but "set WARDLINE_FILIGREE_TOKEN" is the + # wrong remedy — the token is present and lacks access. Say "forbidden", not the env var. + proj = tmp_path / "proj" + proj.mkdir() + _write(proj, "svc.py", _LEAKY) + monkeypatch.setattr("wardline.cli.scan.FiligreeEmitter", _emitter_returning(403, auth_rejected=True)) + out = tmp_path / "f.jsonl" + result = CliRunner().invoke(scan, [str(proj), "--output", str(out), "--filigree-url", "http://x"]) + assert result.exit_code == 0, result.output + low = result.output.lower() + assert "403" in result.output and "forbidden" in low + assert "wardline_filigree_token" not in low + assert "could not reach" not in low + + +def test_scan_filigree_5xx_says_server_error_not_unreachable(tmp_path, monkeypatch) -> None: + # A 5xx outage reached us: distinct from the 401 auth case and the genuine + # transport-unreachable case. Must say "server error", never "could not reach". 
+ proj = tmp_path / "proj" + proj.mkdir() + _write(proj, "svc.py", _LEAKY) + monkeypatch.setattr("wardline.cli.scan.FiligreeEmitter", _emitter_returning(503, auth_rejected=False)) + out = tmp_path / "f.jsonl" + result = CliRunner().invoke(scan, [str(proj), "--output", str(out), "--filigree-url", "http://x"]) + assert result.exit_code == 0, result.output + low = result.output.lower() + assert "503" in result.output and "server error" in low + assert "could not reach" not in low + + # --- SP9: wardline scan --loomweave-url --------------------------------------- # scan.py imports write_facts_to_loomweave lazily inside the `if loomweave_url` block # (`from wardline.loomweave.write import write_facts_to_loomweave`), so the binding diff --git a/tests/unit/core/test_agent_summary.py b/tests/unit/core/test_agent_summary.py index d6f7d531..62f3c26f 100644 --- a/tests/unit/core/test_agent_summary.py +++ b/tests/unit/core/test_agent_summary.py @@ -64,6 +64,28 @@ def test_agent_summary_gate_block_carries_reason_and_evaluated(tmp_path: Path) - assert "unsuppressed" in out["gate"]["evaluated"] +def test_agent_summary_gate_block_carries_migration_hint(tmp_path: Path) -> None: + # The "see gate.migration_hint" pointer in next_actions must resolve on THIS surface: + # the agent_summary gate block carries the rollout hint too, not only the MCP scan + # top-level gate block (the dangling-pointer fix). 
+ from wardline.core.run import baseline_migration_hint + + (tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") + scan = run_scan(tmp_path) + bl = tmp_path / ".wardline" / "baseline.yaml" + bl.parent.mkdir(parents=True, exist_ok=True) + write_baseline(bl, [next(f for f in scan.findings if f.rule_id == "PY-WL-101")]) + rescan = run_scan(tmp_path) + decision = gate_decision(rescan, Severity.ERROR) + hint = baseline_migration_hint(rescan, decision, root=tmp_path, new_since=None) + assert hint is not None # baselined-only trip with a committed baseline -> a hint + out = build_agent_summary(rescan, decision, migration_hint=hint).to_dict() + assert out["gate"]["migration_hint"] == hint + # The field is present (and None) when no hint is threaded — the key never disappears. + out_default = build_agent_summary(rescan, decision).to_dict() + assert out_default["gate"]["migration_hint"] is None + + def test_agent_summary_no_active_defects_still_has_next_actions(tmp_path: Path) -> None: (tmp_path / "svc.py").write_text("def f():\n return 1\n", encoding="utf-8") scan = run_scan(tmp_path) diff --git a/tests/unit/core/test_run.py b/tests/unit/core/test_run.py index f807bc8d..7dbc7819 100644 --- a/tests/unit/core/test_run.py +++ b/tests/unit/core/test_run.py @@ -9,6 +9,7 @@ from wardline.core.finding import Finding, Kind, Location, Severity, SuppressionState from wardline.core.judged import JudgedFP, write_judged from wardline.core.run import ( + GateDecision, ScanResult, ScanSummary, baseline_migration_hint, @@ -228,6 +229,43 @@ def test_gate_decision_reason_names_active_defect_on_genuine_trip(tmp_path: Path assert "--trust-suppressions" not in decision.reason +def test_gate_decision_reason_names_both_active_and_suppressed_on_mixed_trip(tmp_path: Path) -> None: + # The mixed branch of _gate_reason: one genuinely-active defect AND one baselined + # defect both gate by default. 
The verdict must name BOTH counts (not collapse to + # one), so the agent sees the real composition of the trip. + proj = tmp_path / "proj" + proj.mkdir() + (proj / "a.py").write_text(_LEAKY, encoding="utf-8") + (proj / "b.py").write_text(_LEAKY, encoding="utf-8") + # Baseline ONLY a.py's finding (fingerprint match); b.py stays active. + fp_a = next( + f.fingerprint + for f in run_scan(proj).findings + if f.rule_id == "PY-WL-101" and f.location.path == "a.py" + ) + _write_baseline(proj, fp_a) + decision = gate_decision(run_scan(proj), Severity.ERROR) + assert decision.tripped is True + assert decision.reason is not None + assert "1 active + 1 suppressed" in decision.reason + assert "--trust-suppressions" in decision.reason + + +def test_gate_decision_rejects_contradictory_construction() -> None: + # The __post_init__ invariant guard: GateDecision must make "tripped gate that reads + # as passed" (dogfood #2) unconstructible, not merely avoided by the factory. + with pytest.raises(ValueError, match="exit_class"): + GateDecision(tripped=True, fail_on="error", exit_class=0, reason="x", evaluated="y") + with pytest.raises(ValueError, match="reason"): + GateDecision(tripped=True, fail_on="error", exit_class=1, reason=None, evaluated="y") + with pytest.raises(ValueError, match="reason"): + # fail_on set but no verdict — the no-gate shape leaking into a gated decision. + GateDecision(tripped=False, fail_on="error", exit_class=0, reason=None, evaluated=None) + # The two legitimate shapes the factory produces still construct cleanly. 
+ GateDecision(tripped=False, fail_on=None, exit_class=0) + GateDecision(tripped=True, fail_on="error", exit_class=1, reason="1 active", evaluated="unsuppressed") + + def test_gate_decision_evaluated_reflects_trust_suppressions(tmp_path: Path) -> None: proj, fp = _leaky_proj(tmp_path) _write_baseline(proj, fp) diff --git a/tests/unit/mcp/test_server_filigree_emit.py b/tests/unit/mcp/test_server_filigree_emit.py index 7e1fc0e8..b8fdfcee 100644 --- a/tests/unit/mcp/test_server_filigree_emit.py +++ b/tests/unit/mcp/test_server_filigree_emit.py @@ -141,3 +141,29 @@ def test_scan_filigree_401_surfaces_auth_reason_to_agent(tmp_path): reason = out["filigree_emit"]["disabled_reason"] assert "401" in reason and "WARDLINE_FILIGREE_TOKEN" in reason assert "unreachable" not in reason + + +def test_scan_filigree_403_says_forbidden_not_set_a_token(tmp_path): + # A 403 is auth-rejected too, but "set WARDLINE_FILIGREE_TOKEN" is the wrong remedy + # (the token is present and lacks access / is blocked). The reason must say forbidden, + # not point at the env var. + (tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") + out = _scan({}, tmp_path, None, FakeEmitter(EmitResult(reachable=False, status=403, auth_rejected=True))) + assert out["filigree"]["reachable"] is False # still soft + reason = out["filigree_emit"]["disabled_reason"] + assert "403" in reason and "forbidden" in reason + assert "WARDLINE_FILIGREE_TOKEN" not in reason + assert "unreachable" not in reason + + +def test_scan_filigree_5xx_says_server_error_not_unreachable(tmp_path): + # A 5xx outage reached us (the sibling is degraded, not absent). The disabled_reason + # must say "server error (503)", distinct from both the 401 auth case and the genuine + # transport-unreachable case (dogfood #5, the untested sibling of the 401 path). 
+ (tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") + out = _scan({}, tmp_path, None, FakeEmitter(EmitResult(reachable=False, status=503))) + assert out["filigree"]["reachable"] is False # still soft + reason = out["filigree_emit"]["disabled_reason"] + assert "503" in reason and "server error" in reason + assert "unreachable" not in reason + assert "WARDLINE_FILIGREE_TOKEN" not in reason diff --git a/tests/unit/mcp/test_server_legis_artifact.py b/tests/unit/mcp/test_server_legis_artifact.py new file mode 100644 index 00000000..22da75fa --- /dev/null +++ b/tests/unit/mcp/test_server_legis_artifact.py @@ -0,0 +1,102 @@ +"""MCP `scan` legis-artifact attachment (`_attach_legis_artifact`). + +The MCP scan path has its own dirty/signed status projection distinct from core +`build_legis_artifact`: it reads `allow_dirty` from the args and computes +`status["signed"] = key present and not dirty`. These tests pin that projection — +the core/CLI layers are covered in test_legis_artifact.py / test_cli.py. + +Every test `delenv`s the ambient key first: an inherited WARDLINE_LEGIS_ARTIFACT_KEY +would otherwise provision signing where a test means "no key". 
+""" + +from __future__ import annotations + +import subprocess + +from wardline.core.legis import LEGIS_ARTIFACT_KEY_ENV +from wardline.mcp.server import _scan + +_LEAKY = ( + "from wardline.decorators import external_boundary, trusted\n" + "@external_boundary\ndef read_raw(p):\n return p\n" + "@trusted\ndef leaky(p):\n return read_raw(p)\n" +) + + +def _git(repo, *args: str) -> None: + subprocess.run(["git", *args], cwd=repo, check=True, capture_output=True) + + +def _committed_repo(tmp_path): + repo = tmp_path / "proj" + repo.mkdir() + (repo / "svc.py").write_text(_LEAKY, encoding="utf-8") + _git(repo, "init", "-q") + _git(repo, "config", "user.email", "t@example.com") + _git(repo, "config", "user.name", "t") + _git(repo, "add", "-A") + _git(repo, "commit", "-qm", "init") + return repo + + +def test_legis_not_attached_unless_requested(tmp_path, monkeypatch) -> None: + # No key provisioned and no legis_artifact arg -> the response is byte-unchanged. + monkeypatch.delenv(LEGIS_ARTIFACT_KEY_ENV, raising=False) + repo = _committed_repo(tmp_path) + out = _scan({}, repo, None, None) + assert "legis_artifact" not in out + assert "legis_artifact_status" not in out + + +def test_legis_artifact_unsigned_when_no_key(tmp_path, monkeypatch) -> None: + # legis_artifact:true with no key -> attach an unsigned artifact (legis optional-verify). + monkeypatch.delenv(LEGIS_ARTIFACT_KEY_ENV, raising=False) + repo = _committed_repo(tmp_path) + out = _scan({"legis_artifact": True}, repo, None, None) + assert "legis_artifact" in out + status = out["legis_artifact_status"] + assert status["configured"] is True + assert status["signed"] is False + assert "artifact_signature" not in out["legis_artifact"] + + +def test_legis_clean_tree_with_key_is_signed(tmp_path, monkeypatch) -> None: + # The positive arm of `signed = key and not dirty`: a key present on a CLEAN tree signs. 
+ monkeypatch.setenv(LEGIS_ARTIFACT_KEY_ENV, "testsecret") + repo = _committed_repo(tmp_path) + out = _scan({}, repo, None, None) # a provisioned key activates the block without the arg + status = out["legis_artifact_status"] + assert status["signed"] is True + assert status.get("dirty") is False + assert out["legis_artifact"]["artifact_signature"].startswith("hmac-sha256:") + + +def test_legis_dirty_tree_with_key_reports_unsigned_with_loud_reason(tmp_path, monkeypatch) -> None: + # The MCP-only projection arm that matters: a dirty tree is NOT signed even with a key + # present (false-provenance guard) -> signed:false, dirty:true, and a loud reason + # (agent-first parity with the CLI's "never gate CI on it" warning). + monkeypatch.setenv(LEGIS_ARTIFACT_KEY_ENV, "testsecret") + repo = _committed_repo(tmp_path) + (repo / "svc.py").write_text(_LEAKY + "\n# dirty\n", encoding="utf-8") + out = _scan({"allow_dirty": True}, repo, None, None) + status = out["legis_artifact_status"] + assert status["signed"] is False # despite the key — the dirty arm forces it + assert status["dirty"] is True + assert status["reason"] is not None and "UNSIGNED" in status["reason"] + assert "never gate CI" in status["reason"] + assert out["legis_artifact"]["dirty"] is True + assert "artifact_signature" not in out["legis_artifact"] + + +def test_legis_dirty_tree_with_key_no_allow_dirty_refuses_softly(tmp_path, monkeypatch) -> None: + # Key present + dirty tree + NO allow_dirty -> signing refused, fail-soft: no postable + # artifact, status carries the refusal reason, the scan itself still succeeds. 
+ monkeypatch.setenv(LEGIS_ARTIFACT_KEY_ENV, "testsecret") + repo = _committed_repo(tmp_path) + (repo / "svc.py").write_text(_LEAKY + "\n# dirty\n", encoding="utf-8") + out = _scan({}, repo, None, None) + status = out["legis_artifact_status"] + assert status["signed"] is False + assert status["reason"] is not None + assert "legis_artifact" not in out # no postable artifact on a refusal + assert out["summary"]["total"] >= 1 # scan unaffected diff --git a/tests/unit/mcp/test_server_query_explain.py b/tests/unit/mcp/test_server_query_explain.py index 100fdd36..0eaea606 100644 --- a/tests/unit/mcp/test_server_query_explain.py +++ b/tests/unit/mcp/test_server_query_explain.py @@ -159,3 +159,21 @@ def test_max_findings_caps_and_marks(tmp_path): assert out["truncation"]["findings_truncated"] is True assert out["truncation"]["findings_returned"] == 3 assert out["truncation"]["findings_total"] >= 10 + + +@pytest.mark.parametrize("bad", [-1, 1.5, "3", True]) +def test_max_findings_rejects_non_negative_integer(tmp_path, bad): + # Agent-actionable validation: a negative / non-int / bool max_findings is a loud + # ToolError, never a silent negative-slice that drops the last finding. + (tmp_path / "svc.py").write_text(_SRC, encoding="utf-8") + with pytest.raises(ToolError, match="max_findings"): + _scan({"max_findings": bad}, tmp_path) + + +@pytest.mark.parametrize("name", ["summary_only", "include_suppressed"]) +def test_boolean_payload_controls_reject_non_bool(tmp_path, name): + # The string "false" must NOT silently coerce to True (the bug the strict _bool_arg + # closes) — a non-bool is rejected loudly, matching max_findings' strictness. 
+ (tmp_path / "svc.py").write_text(_SRC, encoding="utf-8") + with pytest.raises(ToolError, match=name): + _scan({name: "false"}, tmp_path) From 77e1d8e2ebfafe54ae397688f0444ae3b552cdda Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 22:30:11 +1000 Subject: [PATCH 13/17] chore(release): cut 1.0.0rc2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increment the release candidate (rc1 → rc2) to carry the PR #30 review hardening (gate invariant, 403/5xx distinction, strict MCP arg validation). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/wardline/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wardline/_version.py b/src/wardline/_version.py index 6c93125c..5a662149 100644 --- a/src/wardline/_version.py +++ b/src/wardline/_version.py @@ -1 +1 @@ -__version__ = "1.0.0rc1" +__version__ = "1.0.0rc2" From 83c66e6e527be23c2b7999d2e9a48cf3cee8cc71 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 22:54:27 +1000 Subject: [PATCH 14/17] docs(release): stamp 1.0.0rc2 changelog, fix stale README output, robust version test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CHANGELOG: stamp the accumulated [Unreleased] work as [1.0.0rc2] - 2026-06-06 and open a fresh empty [Unreleased]; consolidate the two `### Added` blocks into one (no content change, removes a Keep-a-Changelog duplicate-section smell). - README: the quick-start scan output said "1 new" — corrected to "1 active", matching the CLI relabel shipped in this same release (and getting-started.md). - test_package: assert __version__ starts with "1.0.0" (release line) instead of the exact rc suffix, so cutting a new rc no longer breaks the test. Suite 2515 passed; ruff clean; mkdocs --strict builds. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 114 ++++++++++++++++++------------------- README.md | 2 +- tests/unit/test_package.py | 3 +- 3 files changed, 60 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff80ab1d..930ac042 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.0.0rc2] - 2026-06-06 + ### Added - **MCP `scan` payload controls — `where` now shrinks the payload, plus `summary_only` / `max_findings` / `include_suppressed` and a default explain cap @@ -41,6 +43,61 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `gate.migration_hint`) pointing at the escape hatches and the new **`UPGRADING.md`**. This is the "my repo went red with no code change" case made self-explaining; the secure default itself is unchanged. +- Live Loomweave port resolution (consumer half of Loomweave **ADR-044**): Wardline + now reads Loomweave's published read-API port from `/.loomweave/ephemeral.port` + and inserts it into `resolve_loomweave_url` precedence as `flag > env > published + port > wardline.yaml`. A live serve's real port self-heals over a stale/default + literal in `wardline.yaml`, so a mis-pinned URL no longer silently strands + federation for a second project (the failure ADR-034's instance-ID guard catches + as `PROJECT_MISMATCH`). Read-never-compute, loopback-by-construction, fail-soft + (missing / malformed / out-of-range / unreadable → fall through to config); skipped + under `strict_defaults`. A deliberate `--loomweave-url` flag or env var still always + wins. No change to wire behaviour or the HMAC signer. +- Signed scan handoff to **legis** (the Weft governance plugin): `wardline scan + --format legis` (CLI) and an opt-in `legis_artifact` block on the MCP `scan` result + produce the verbatim-postable `scan` for legis's `POST /wardline/scan-results`. 
The + artifact carries four provenance fields (`scanner_identity`, `rule_set_version`, + `commit_sha`, `tree_sha`) and an `artifact_signature` — `hmac-sha256:v2:` over + legis-canonical JSON (sorted-key, tight-separator, non-ASCII-preserved), byte-exact + with legis's signer (pinned by a golden vector captured from real legis). The shared + secret is read from `WARDLINE_LEGIS_ARTIFACT_KEY` (env or `.env`); unset → unsigned + with `unverified` provenance. Signing refuses a dirty / non-git tree (false + provenance); the MCP block is fail-soft, the CLI is loud (exit 2). The artifact carries + the **whole scan**, each finding projected onto legis's accepted vocabulary — `properties` + filtered to the eight trust tiers (diagnostics like `sink`/`callee`/`markers` + dropped; the rich MCP/SARIF/Loomweave wire is unchanged), suppression proof carried in + `properties`, and `baselined`/`judged` mapped onto legis's `suppressed`. `active` + stays `active`, so legis reproduces Wardline's gate population exactly (one judge); + legis enforces its own 500-finding cap (a larger scan is rejected loudly, never silently truncated). + The hermetic conformance test now mirrors legis's *full* ingest validation (trust + tiers, suppression proof, supported states), closing the prior false-green. See + [Signed scan handoff to legis](guides/legis-handoff.md). +- `wardline assure` CLI and MCP `assure` tool: trust-surface COVERAGE posture — how many + declared trust boundaries (`@external_boundary` / `@trust_boundary` / `@trusted`) the + engine reached a definite verdict on vs. how many are honestly unknown (`unknown` list), + plus a `waiver_debt` rollup (days-to-expiry per configured waiver, lapsed entries + surfaced not dropped). Zero-config — reads what every scan already computes. 
+- `wardline attest` CLI and MCP `attest` / `verify_attestation` tools: signed, reproducible + evidence bundle (`schema: wardline-attest-1`) capturing commit, ruleset hash, the full + assurance posture, and per-boundary verdicts. HMAC-SHA256 signed with an install-minted + project key (`wardline install` appends `WARDLINE_ATTEST_KEY` to `.env`). The CLI and MCP + default to refusing a dirty working tree (`--allow-dirty` / `allow_dirty: true` to + override, records `dirty: true` honestly). `verify_attestation` checks signature (offline) + and optionally re-derives the payload at the current tree (`--reproduce` / `reproduce: + true`). SEI-keyed boundaries opt-in via `--loomweave-url` (fail-soft). +- `file_finding` (MCP tool + `wardline file-finding` CLI): file ONE finding by fingerprint + into a tracked Filigree issue, returning its id (idempotent, fail-soft). Scan emission now + sets `mark_unseen=True` (non-empty scans) so a fixed finding enters Filigree's + `unseen_in_latest` state and a regressed one reopens its linked issue on the next scan. + (Issue close-on-fixed is gated on Filigree's clean-stale sweep.) (WS-A2) +- MCP `scan` now emits findings to Filigree when a `--filigree-url` is configured, at + parity with the CLI (a `filigree` block in the scan result; fail-soft — an unreachable + sibling or rejected payload is reported, never fails the scan). Closes the CLI/MCP + finding-emission asymmetry. (WS-A1) +- MCP `scan` gains a server-side `where` filter (rule_id/qualname/severity/suppression/kind/ + path_glob/sink/tier) and an `explain: true` mode that inlines each active defect's taint + provenance — killing the scan-then-N-explains round-trips. New read-only `wardline findings` + CLI verb shares the same filter core. 
(WS-B1, WS-B2) ### Fixed - **`next_actions` is gate-aware — never reads as "passed" when the gate failed @@ -132,63 +189,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 returning `http://localhost:/api/weft/scan-results` to match `install/detect.py`'s writer. A live dashboard on a new port self-heals over a stale install-stamped literal. -### Added -- Live Loomweave port resolution (consumer half of Loomweave **ADR-044**): Wardline - now reads Loomweave's published read-API port from `/.loomweave/ephemeral.port` - and inserts it into `resolve_loomweave_url` precedence as `flag > env > published - port > wardline.yaml`. A live serve's real port self-heals over a stale/default - literal in `wardline.yaml`, so a mis-pinned URL no longer silently strands - federation for a second project (the failure ADR-034's instance-ID guard catches - as `PROJECT_MISMATCH`). Read-never-compute, loopback-by-construction, fail-soft - (missing / malformed / out-of-range / unreadable → fall through to config); skipped - under `strict_defaults`. A deliberate `--loomweave-url` flag or env var still always - wins. No change to wire behaviour or the HMAC signer. -- Signed scan handoff to **legis** (the Weft governance plugin): `wardline scan - --format legis` (CLI) and an opt-in `legis_artifact` block on the MCP `scan` result - produce the verbatim-postable `scan` for legis's `POST /wardline/scan-results`. The - artifact carries four provenance fields (`scanner_identity`, `rule_set_version`, - `commit_sha`, `tree_sha`) and an `artifact_signature` — `hmac-sha256:v2:` over - legis-canonical JSON (sorted-key, tight-separator, non-ASCII-preserved), byte-exact - with legis's signer (pinned by a golden vector captured from real legis). The shared - secret is read from `WARDLINE_LEGIS_ARTIFACT_KEY` (env or `.env`); unset → unsigned - with `unverified` provenance. 
Signing refuses a dirty / non-git tree (false - provenance); the MCP block is fail-soft, the CLI is loud (exit 2). The artifact carries - the **whole scan**, each finding projected onto legis's accepted vocabulary — `properties` - filtered to the eight trust tiers (diagnostics like `sink`/`callee`/`markers` - dropped; the rich MCP/SARIF/Loomweave wire is unchanged), suppression proof carried in - `properties`, and `baselined`/`judged` mapped onto legis's `suppressed`. `active` - stays `active`, so legis reproduces Wardline's gate population exactly (one judge); - legis enforces its own 500-finding cap (a larger scan is rejected loudly, never silently truncated). - The hermetic conformance test now mirrors legis's *full* ingest validation (trust - tiers, suppression proof, supported states), closing the prior false-green. See - [Signed scan handoff to legis](guides/legis-handoff.md). -- `wardline assure` CLI and MCP `assure` tool: trust-surface COVERAGE posture — how many - declared trust boundaries (`@external_boundary` / `@trust_boundary` / `@trusted`) the - engine reached a definite verdict on vs. how many are honestly unknown (`unknown` list), - plus a `waiver_debt` rollup (days-to-expiry per configured waiver, lapsed entries - surfaced not dropped). Zero-config — reads what every scan already computes. -- `wardline attest` CLI and MCP `attest` / `verify_attestation` tools: signed, reproducible - evidence bundle (`schema: wardline-attest-1`) capturing commit, ruleset hash, the full - assurance posture, and per-boundary verdicts. HMAC-SHA256 signed with an install-minted - project key (`wardline install` appends `WARDLINE_ATTEST_KEY` to `.env`). The CLI and MCP - default to refusing a dirty working tree (`--allow-dirty` / `allow_dirty: true` to - override, records `dirty: true` honestly). `verify_attestation` checks signature (offline) - and optionally re-derives the payload at the current tree (`--reproduce` / `reproduce: - true`). 
SEI-keyed boundaries opt-in via `--loomweave-url` (fail-soft). -- `file_finding` (MCP tool + `wardline file-finding` CLI): file ONE finding by fingerprint - into a tracked Filigree issue, returning its id (idempotent, fail-soft). Scan emission now - sets `mark_unseen=True` (non-empty scans) so a fixed finding enters Filigree's - `unseen_in_latest` state and a regressed one reopens its linked issue on the next scan. - (Issue close-on-fixed is gated on Filigree's clean-stale sweep.) (WS-A2) -- MCP `scan` now emits findings to Filigree when a `--filigree-url` is configured, at - parity with the CLI (a `filigree` block in the scan result; fail-soft — an unreachable - sibling or rejected payload is reported, never fails the scan). Closes the CLI/MCP - finding-emission asymmetry. (WS-A1) -- MCP `scan` gains a server-side `where` filter (rule_id/qualname/severity/suppression/kind/ - path_glob/sink/tier) and an `explain: true` mode that inlines each active defect's taint - provenance — killing the scan-then-N-explains round-trips. New read-only `wardline findings` - CLI verb shares the same filter core. (WS-B1, WS-B2) - ### Security - **Builtin trust-marker decorators are now trusted only when they resolve to the real exports — closes a spoofable false-green.** The default decorator seeding diff --git a/README.md b/README.md index 4140bcf9..de3142c2 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ def build_record(req): ```console $ wardline scan . --fail-on ERROR -scanned 1 file(s); 3 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 new -> findings.jsonl +scanned 1 file(s); 3 finding(s) — 0 suppressed (0 baseline / 0 waiver / 0 judged), 1 active -> findings.jsonl $ echo $? 
1 ``` diff --git a/tests/unit/test_package.py b/tests/unit/test_package.py index 8214cd0c..3bb6369c 100644 --- a/tests/unit/test_package.py +++ b/tests/unit/test_package.py @@ -3,4 +3,5 @@ def test_version_is_exported() -> None: assert isinstance(wardline.__version__, str) - assert wardline.__version__.startswith("1.0.0rc1") + # Pin the release line, not the rc suffix, so cutting a new rc doesn't break this. + assert wardline.__version__.startswith("1.0.0") From 6f2c1d6e4550a5309b5dbda3c55484ef4f48f38e Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sat, 6 Jun 2026 22:57:40 +1000 Subject: [PATCH 15/17] style: apply ruff format to satisfy the CI format gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ruff format --check src tests` (run in CI's Lint+Format job) was red. Reformats 6 test files: two touched in this rc2 work (test_run.py, test_server_query_explain.py), test_variable_level.py (dogfood branch change), and three with pre-existing drift already on main (test_legis_intake_contract.py, test_client.py, test_sei_client_wire.py) — the gate checks the whole tree, so all six must be clean. Formatting only; no behavior change. Suite 2515 passed; ruff check + format + mypy all clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/conformance/test_legis_intake_contract.py | 4 +--- tests/unit/core/test_run.py | 4 +--- tests/unit/loomweave/test_client.py | 8 ++++++-- tests/unit/loomweave/test_sei_client_wire.py | 16 ++++++++++++---- tests/unit/mcp/test_server_query_explain.py | 1 + tests/unit/scanner/taint/test_variable_level.py | 8 +------- 6 files changed, 22 insertions(+), 19 deletions(-) diff --git a/tests/conformance/test_legis_intake_contract.py b/tests/conformance/test_legis_intake_contract.py index d70e8765..bded5e04 100644 --- a/tests/conformance/test_legis_intake_contract.py +++ b/tests/conformance/test_legis_intake_contract.py @@ -247,9 +247,7 @@ def test_legis_gate_population_equals_wardline_gate_active_count(tmp_path: Path) # summary.active, which counts active in the (possibly suppressed) emitted findings. scan, result = _artifact(_proj(tmp_path)) gate_population = result.gate_findings if result.gate_findings is not None else result.findings - gate_active = sum( - 1 for f in gate_population if f.kind is Kind.DEFECT and f.suppressed is SuppressionState.ACTIVE - ) + gate_active = sum(1 for f in gate_population if f.kind is Kind.DEFECT and f.suppressed is SuppressionState.ACTIVE) assert len(active_defects(scan)) == gate_active assert gate_active >= 1 diff --git a/tests/unit/core/test_run.py b/tests/unit/core/test_run.py index 7dbc7819..13919cb1 100644 --- a/tests/unit/core/test_run.py +++ b/tests/unit/core/test_run.py @@ -239,9 +239,7 @@ def test_gate_decision_reason_names_both_active_and_suppressed_on_mixed_trip(tmp (proj / "b.py").write_text(_LEAKY, encoding="utf-8") # Baseline ONLY a.py's finding (fingerprint match); b.py stays active. 
fp_a = next( - f.fingerprint - for f in run_scan(proj).findings - if f.rule_id == "PY-WL-101" and f.location.path == "a.py" + f.fingerprint for f in run_scan(proj).findings if f.rule_id == "PY-WL-101" and f.location.path == "a.py" ) _write_baseline(proj, fp_a) decision = gate_decision(run_scan(proj), Severity.ERROR) diff --git a/tests/unit/loomweave/test_client.py b/tests/unit/loomweave/test_client.py index 027d321c..073814dd 100644 --- a/tests/unit/loomweave/test_client.py +++ b/tests/unit/loomweave/test_client.py @@ -44,8 +44,12 @@ def test_resolve_signs_and_parses(): assert url == "http://loomweave.example/api/wardline/resolve" assert json.loads(sent_body)["project"] == "proj" expected = sign_request( - "s3cr3t", "POST", "/api/wardline/resolve", sent_body, - timestamp=headers["X-Weft-Timestamp"], nonce=headers["X-Weft-Nonce"], + "s3cr3t", + "POST", + "/api/wardline/resolve", + sent_body, + timestamp=headers["X-Weft-Timestamp"], + nonce=headers["X-Weft-Nonce"], ) assert headers["X-Weft-Component"] == f"loomweave:{expected}" diff --git a/tests/unit/loomweave/test_sei_client_wire.py b/tests/unit/loomweave/test_sei_client_wire.py index 426340d1..60fb8a97 100644 --- a/tests/unit/loomweave/test_sei_client_wire.py +++ b/tests/unit/loomweave/test_sei_client_wire.py @@ -36,8 +36,12 @@ def test_capabilities_gets_route_and_parses() -> None: assert url == "http://loomweave.example/api/v1/_capabilities" # GET routes are signed too (empty body) — the shared _send path signs everything. 
expected = sign_request( - "s3cr3t", "GET", "/api/v1/_capabilities", sent_body, - timestamp=headers["X-Weft-Timestamp"], nonce=headers["X-Weft-Nonce"], + "s3cr3t", + "GET", + "/api/v1/_capabilities", + sent_body, + timestamp=headers["X-Weft-Timestamp"], + nonce=headers["X-Weft-Nonce"], ) assert headers["X-Weft-Component"] == f"loomweave:{expected}" @@ -65,8 +69,12 @@ def test_resolve_identity_posts_locator_and_signs() -> None: assert url == "http://loomweave.example/api/v1/identity/resolve" assert json.loads(sent_body) == {"locator": "python:function:m.f"} expected = sign_request( - "s3cr3t", "POST", "/api/v1/identity/resolve", sent_body, - timestamp=headers["X-Weft-Timestamp"], nonce=headers["X-Weft-Nonce"], + "s3cr3t", + "POST", + "/api/v1/identity/resolve", + sent_body, + timestamp=headers["X-Weft-Timestamp"], + nonce=headers["X-Weft-Nonce"], ) assert headers["X-Weft-Component"] == f"loomweave:{expected}" diff --git a/tests/unit/mcp/test_server_query_explain.py b/tests/unit/mcp/test_server_query_explain.py index 0eaea606..56624a91 100644 --- a/tests/unit/mcp/test_server_query_explain.py +++ b/tests/unit/mcp/test_server_query_explain.py @@ -22,6 +22,7 @@ def _baseline_all(tmp_path) -> None: bl.parent.mkdir(parents=True, exist_ok=True) write_baseline(bl, defects) + # Two boundaries + two trusted leaks → PY-WL-101 fires on both leaks. _SRC = ( "from wardline.decorators import external_boundary, trusted\n" diff --git a/tests/unit/scanner/taint/test_variable_level.py b/tests/unit/scanner/taint/test_variable_level.py index ecc7c013..c498abe2 100644 --- a/tests/unit/scanner/taint/test_variable_level.py +++ b/tests/unit/scanner/taint/test_variable_level.py @@ -309,13 +309,7 @@ def test_lambda_rebinding_survives_no_else_if_for_post_branch_call() -> None: # (conservative). 
A clear-then-union merge that let the implicit (no-else) # fall-through arm win last reverted ``cb`` to the safe lambda and dropped the # detection — a false negative the pre-branch-local engine did not have. - src = ( - "def handler(raw):\n" - " cb = lambda c: c\n" - " if flag:\n" - " cb = lambda c: sink(c)\n" - " cb(raw)\n" - ) + src = "def handler(raw):\n cb = lambda c: c\n if flag:\n cb = lambda c: sink(c)\n cb(raw)\n" assert _lambda_body_sink_arg(src) == T.EXTERNAL_RAW From da9c5352f721a02a0ce4406ba7fdb796da00f874 Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sun, 7 Jun 2026 00:33:25 +1000 Subject: [PATCH 16/17] fix(rc2): close 3 PR-review Important findings (EmitResult invariants, FP guard, doc-anchor rot) Addresses the three Important findings from the PR #32 review panel, each validated with an actual RED->GREEN cycle under debugging discipline. I-1 EmitResult contradictory states (core/filigree_emit.py): - auth_rejected is now a derived @property (status in {401,403}), deleting the redundant axis so "auth-rejected (200)" is unrepresentable, not merely unbuilt. - __post_init__ guard mirrors GateDecision: a reachable/success result carries no error status; a soft-failure created/updated nothing. Rejects reachable+503. - Docstring corrected (status is the error status; None on transport-fail AND 2xx). - No wire change: server.py still serializes auth_rejected via the property. I-2 false-positive guard for PY-WL-110 (test_contradictory_trust.py): - Empirically: a foreign-only marker stack is filtered at the anchoring gate (provenance "fallback"), never reaching the line-81 prefix check. Added both the system-level test and the isolating test (real trust_boundary anchor + a coincidental foreign `trusted`). Mutation-proven: breaking the prefix check makes the isolating test fire a false PY-WL-110. I-3 stale file:line anchors (finding-lifecycle-vocabulary.md): - Re-derived every churned-file anchor from HEAD; corrected ~26 citations. 
- Added a two-way content-binding discipline test: each load-bearing anchor's token must be on the cited source line AND the doc must cite that line, so doc and code can never silently diverge again. Full suite 2520 passed; ruff/format/mypy clean; mkdocs --strict builds. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../reference/finding-lifecycle-vocabulary.md | 44 ++++++------ src/wardline/core/filigree_emit.py | 29 ++++++-- tests/docs/test_glossary_vocabulary.py | 71 +++++++++++++++++++ tests/unit/cli/test_cli.py | 16 +++-- tests/unit/core/test_filigree_emit.py | 23 ++++++ tests/unit/mcp/test_server_filigree_emit.py | 4 +- .../scanner/rules/test_contradictory_trust.py | 44 ++++++++++++ 7 files changed, 194 insertions(+), 37 deletions(-) diff --git a/docs/reference/finding-lifecycle-vocabulary.md b/docs/reference/finding-lifecycle-vocabulary.md index e2544416..7122fc34 100644 --- a/docs/reference/finding-lifecycle-vocabulary.md +++ b/docs/reference/finding-lifecycle-vocabulary.md @@ -47,7 +47,7 @@ waiver > judged > baseline** — explicit human intent wins, then the LLM verdic **"suppressed"** is the umbrella term for "any state other than `active`": `baselined` + `waived` + `judged`. The CLI prints this sum as the `suppressed` -count (`src/wardline/cli/scan.py:355`), and `to_filigree_metadata` only writes a +count (`src/wardline/cli/scan.py:366`), and `to_filigree_metadata` only writes a `suppressed` key when the state is not `active` (`src/wardline/core/finding.py:184-187`). 
@@ -59,15 +59,15 @@ consistently, on every surface: | Surface | Where | Term | | --- | --- | --- | | Enum | `src/wardline/core/finding.py:68` | `SuppressionState.ACTIVE = "active"` | -| Summary field | `src/wardline/core/run.py:49`, built at `src/wardline/core/run.py:264` | `ScanSummary.active` | -| CLI summary line | `src/wardline/cli/scan.py:356` | `… {s.active} active` | -| MCP scan response | `src/wardline/mcp/server.py:307` | `summary.active` | -| Agent-summary JSON | `src/wardline/core/agent_summary.py:86` | `summary.active_defects` | +| Summary field | `src/wardline/core/run.py:49`, built at `src/wardline/core/run.py:280` | `ScanSummary.active` | +| CLI summary line | `src/wardline/cli/scan.py:367` | `… {s.active} active` | +| MCP scan response | `src/wardline/mcp/server.py:314` | `summary.active` | +| Agent-summary JSON | `src/wardline/core/agent_summary.py:90` | `summary.active_defects` | | `wardline:loop` prompt | `src/wardline/mcp/prompts.py:13` | "Read `summary.active`" | The agent-summary key is `active_defects` rather than bare `active` — that is a descriptive-suffix convention alongside `total_findings` / `suppressed_findings` -(`src/wardline/core/agent_summary.py:85-92`), not a different concept. It counts +(`src/wardline/core/agent_summary.py:89-96`), not a different concept. It counts the same population. The discipline test `tests/cli/test_scan_summary_vocab.py` pins this: the CLI @@ -83,8 +83,8 @@ still legitimately means three different things depending on the surface: | "new" on this surface | Means | Owner / anchor | | --- | --- | --- | | Filigree store | An **unseen fingerprint** — first time this finding identity is seen for a `(file, scan_source)`. Driven by `mark_unseen` / the absent-fingerprint sweep. 
| **Filigree-owned** lifecycle (`src/wardline/core/filigree_emit.py:68-76`) | -| `wardline scan --new-since ` | **Delta-scope**: the gate fires only on defects in files/entities changed since a git ref; everything else is re-marked `baselined`. | `src/wardline/core/run.py:240-259`; help text `src/wardline/cli/scan.py` (`--new-since`, "new findings only") | -| (historical) CLI summary | Formerly relabelled the `active` count as "N new". **Corrected to "N active"** so the CLI matches every other surface. | `src/wardline/cli/scan.py:356` | +| `wardline scan --new-since ` | **Delta-scope**: the gate fires only on defects in files/entities changed since a git ref; everything else is re-marked `baselined`. | `src/wardline/core/run.py:256-275`; help text `src/wardline/cli/scan.py` (`--new-since`, "new findings only") | +| (historical) CLI summary | Formerly relabelled the `active` count as "N new". **Corrected to "N active"** so the CLI matches every other surface. | `src/wardline/cli/scan.py:367` | The first-seen Filigree sense and the delta-scope `--new-since` sense are genuinely distinct concepts; neither is "active". An agent should read the CLI / @@ -97,19 +97,19 @@ There are **two distinct populations** of defects in one scan, and they can differ on purpose: 1. **Emitted-active** — `summary.active` counts `active` defects in the - **emitted** (post-annotation) findings (`src/wardline/core/run.py:262-265`). + **emitted** (post-annotation) findings (`src/wardline/core/run.py:277-285`). Baseline / waiver / judged annotate these findings in place; a suppressed defect is still emitted, just not counted as `active`. 2. **Gate population** — the `--fail-on` gate evaluates a **separate** `ScanResult.gate_findings` list: the *unsuppressed* population - (`src/wardline/core/run.py:226-230`). By default, repository-controlled + (`src/wardline/core/run.py:242-246`). 
By default, repository-controlled baseline / waiver / judged entries **annotate** the emitted findings but do **not** clear the gate — so a malicious PR cannot green the gate by committing a suppression keyed to its own new defect. `gate_decision` evaluates `gate_findings` when present, else falls back to `findings` (the trusted `--trust-suppressions` / directly-constructed path) - (`src/wardline/core/run.py:290-292`). + (`src/wardline/core/run.py:307-308`). This is why **`summary.active: 0` can co-exist with `gate.tripped: true`**: every defect was suppressed by a committed baseline (so emitted-active is 0), but those @@ -119,13 +119,13 @@ carries `tripped` / `fail_on` / `exit_class` **plus** a human `reason` and the `evaluated` population it judged (`src/wardline/core/run.py:82-92`), so the `0 active + tripped` case explains itself instead of reading as a defect. The MCP `scan` block exposes `gate.tripped` / `gate.reason` / `gate.evaluated` / -`gate.migration_hint` (`src/wardline/mcp/server.py:327-332`); the CLI prints +`gate.migration_hint` (`src/wardline/mcp/server.py:333-339`); the CLI prints `gate: FAILED (--fail-on …) — ` then `gate: evaluated <…>` on stderr -(`src/wardline/cli/scan.py:370`). +(`src/wardline/cli/scan.py:381-382`). `--new-since` scopes **both** populations identically: any `active` defect outside the delta is re-marked `baselined` in both the emitted and gate lists -(`src/wardline/core/run.py:240-259`). +(`src/wardline/core/run.py:256-275`). 
## Cross-surface mapping table @@ -133,14 +133,14 @@ How each concept appears on each surface: | Concept | CLI summary text | `ScanSummary` field | MCP `summary` key | Agent-summary key | Filigree store | | --- | --- | --- | --- | --- | --- | -| every finding | `N finding(s)` | `total` (`run.py:48`) | `total` (`server.py:306`) | `total_findings` (`agent_summary.py:85`) | one finding per wire entry | -| live defect | `N active` (`scan.py:356`) | `active` (`run.py:49,264`) | `active` (`server.py:307`) | `active_defects` (`agent_summary.py:86`) | no `suppressed` key (`finding.py:184`) | -| suppressed (sum) | `N suppressed` (`scan.py:355`) | `baselined+waived+judged` | the three keys | `suppressed_findings` (`agent_summary.py:87`) | `metadata.wardline.suppressed` (`finding.py:184-187`) | -| baselined | `N baseline` | `baselined` (`run.py:51`) | `baselined` (`server.py:308`) | `baselined` (`agent_summary.py:89`) | `suppressed: "baselined"` | -| waived | `N waiver` | `waived` (`run.py:52`) | `waived` (`server.py:309`) | `waived` (`agent_summary.py:90`) | `suppressed: "waived"` | -| judged | `N judged` | `judged` (`run.py:53`) | `judged` (`server.py:310`) | `judged` (`agent_summary.py:91`) | `suppressed: "judged"` | -| under-scan | `N file(s) could not be analyzed` | `unanalyzed` (`run.py:59`) | `unanalyzed` (`server.py:314`) | `unanalyzed` (`agent_summary.py:92`) | `WLN-ENGINE-*` facts | -| gate verdict | exit code + `--fail-on` | (`gate_findings`, `run.py:78`) | `gate.tripped` (`server.py:327`) | `gate.tripped` (`agent_summary.py:95`) | not emitted to Filigree | +| every finding | `N finding(s)` | `total` (`run.py:48`) | `total` (`server.py:313`) | `total_findings` (`agent_summary.py:89`) | one finding per wire entry | +| live defect | `N active` (`scan.py:367`) | `active` (`run.py:49,280`) | `active` (`server.py:314`) | `active_defects` (`agent_summary.py:90`) | no `suppressed` key (`finding.py:184`) | +| suppressed (sum) | `N suppressed` (`scan.py:366`) | 
`baselined+waived+judged` | the three keys | `suppressed_findings` (`agent_summary.py:91`) | `metadata.wardline.suppressed` (`finding.py:184-187`) | +| baselined | `N baseline` | `baselined` (`run.py:51`) | `baselined` (`server.py:315`) | `baselined` (`agent_summary.py:93`) | `suppressed: "baselined"` | +| waived | `N waiver` | `waived` (`run.py:52`) | `waived` (`server.py:316`) | `waived` (`agent_summary.py:94`) | `suppressed: "waived"` | +| judged | `N judged` | `judged` (`run.py:53`) | `judged` (`server.py:317`) | `judged` (`agent_summary.py:95`) | `suppressed: "judged"` | +| under-scan | `N file(s) could not be analyzed` | `unanalyzed` (`run.py:59`) | `unanalyzed` (`server.py:321`) | `unanalyzed` (`agent_summary.py:96`) | `WLN-ENGINE-*` facts | +| gate verdict | exit code + `--fail-on` | (`gate_findings`, `run.py:78`) | `gate.tripped` (`server.py:334`) | `gate.tripped` (`agent_summary.py:99`) | not emitted to Filigree | ## For the suite diff --git a/src/wardline/core/filigree_emit.py b/src/wardline/core/filigree_emit.py index b11b9e95..a6be3a97 100644 --- a/src/wardline/core/filigree_emit.py +++ b/src/wardline/core/filigree_emit.py @@ -98,13 +98,28 @@ class EmitResult: failed: int = 0 warnings: tuple[str, ...] = () # Discriminate WHY enrichment was unavailable so the caller can say the actionable - # thing instead of a flat "could not reach" (dogfood #5). ``status`` is the HTTP - # status when one reached us (401/403 auth-refused, 5xx outage) and None when the - # transport itself failed (connection refused / DNS / timeout — genuinely unreachable). - # ``auth_rejected`` is the 401/403 case: present-but-refusing-bearer-auth. All of these - # stay SOFT (reachable=False); only the message differs. + # thing instead of a flat "could not reach" (dogfood #5). 
``status`` is the HTTP status + # for the SOFT-failure sub-cases — 401/403 (auth refused) or 5xx (outage) — and None for + # both a transport failure (connection refused / DNS / timeout — genuinely unreachable) + # and a 2xx success. It is the *error* status: a reached/success result carries none. + # All of these stay SOFT (reachable=False); only the message differs. status: int | None = None - auth_rejected: bool = False + + @property + def auth_rejected(self) -> bool: + # The 401/403 case: present-but-refusing-bearer-auth. Derived from ``status`` rather + # than stored as an independent field so the two can never disagree (an + # "auth-rejected (200)" is unrepresentable, not merely unbuilt by the producer). + return self.status in (401, 403) + + def __post_init__(self) -> None: + # Mirror GateDecision's construction-time guard so a second constructor cannot + # express a contradictory outcome: a reached/success result carries no error status, + # and a soft-failure (unreachable) created/updated/failed nothing. + if self.reachable and self.status is not None: + raise ValueError(f"a reachable EmitResult carries no error status (got {self.status})") + if not self.reachable and (self.created or self.updated or self.failed): + raise ValueError("an unreachable EmitResult must have zero created/updated/failed") def filigree_disabled_reason(*, reachable: bool, auth_rejected: bool, status: int | None) -> str | None: @@ -177,7 +192,7 @@ def emit(self, findings: Sequence[Finding], *, scanned_paths: Sequence[str] = () # Filigree is present but its opt-in bearer auth is on and refusing us. Stays # SOFT (enrichment unavailable, never exit-2) — but distinguished as auth so the # caller can say "401 (set WARDLINE_FILIGREE_TOKEN)" instead of "could not reach". 
- return EmitResult(reachable=False, status=resp.status, auth_rejected=True) + return EmitResult(reachable=False, status=resp.status) if resp.status >= 500: # Server-side outage (5xx) — the sibling is degraded, not a Wardline payload bug. # Treat like absent (warn + continue), carrying the status for an honest message. diff --git a/tests/docs/test_glossary_vocabulary.py b/tests/docs/test_glossary_vocabulary.py index 8288434c..1b76119b 100644 --- a/tests/docs/test_glossary_vocabulary.py +++ b/tests/docs/test_glossary_vocabulary.py @@ -8,6 +8,7 @@ from __future__ import annotations +import re from pathlib import Path from wardline.core.finding import SuppressionState @@ -17,6 +18,53 @@ _MKDOCS = _REPO / "mkdocs.yml" _NAV_PATH = "reference/finding-lifecycle-vocabulary.md" +# The glossary promises "every claim cites a real `file:line`". Line anchors rot silently +# when the cited code moves (an in-range / non-blank check would NOT catch it — the line +# still holds *some* code). So bind the load-bearing navigation anchors to a token that +# must appear on that exact source line. If code moves, this test fails and the source +# line here AND the glossary citation must be updated together. Each tuple is +# ``(repo-relative path, 1-based line, substring required on that line)``. +_ANCHORS: tuple[tuple[str, int, str], ...] 
= ( + # src/wardline/core/run.py — ScanSummary fields, gate population, delta-scope, gate_decision + ("src/wardline/core/run.py", 48, "total: int"), + ("src/wardline/core/run.py", 49, "active: int"), + ("src/wardline/core/run.py", 51, "baselined: int"), + ("src/wardline/core/run.py", 52, "waived: int"), + ("src/wardline/core/run.py", 53, "judged: int"), + ("src/wardline/core/run.py", 59, "unanalyzed: int"), + ("src/wardline/core/run.py", 78, "gate_findings:"), + ("src/wardline/core/run.py", 82, "class GateDecision"), + ("src/wardline/core/run.py", 246, "Baseline(frozenset())"), + ("src/wardline/core/run.py", 256, "def apply_delta_scope"), + ("src/wardline/core/run.py", 280, "active=sum"), + ("src/wardline/core/run.py", 307, "honors_suppressions"), + # src/wardline/cli/scan.py — CLI summary line + gate stderr + ("src/wardline/cli/scan.py", 366, "suppressed"), + ("src/wardline/cli/scan.py", 367, "{s.active} active"), + ("src/wardline/cli/scan.py", 381, "gate: FAILED"), + # src/wardline/mcp/server.py — MCP scan summary + gate block + ("src/wardline/mcp/server.py", 313, '"total": result.summary.total'), + ("src/wardline/mcp/server.py", 314, '"active": result.summary.active'), + ("src/wardline/mcp/server.py", 315, '"baselined": result.summary.baselined'), + ("src/wardline/mcp/server.py", 316, '"waived": result.summary.waived'), + ("src/wardline/mcp/server.py", 317, '"judged": result.summary.judged'), + ("src/wardline/mcp/server.py", 321, '"unanalyzed": result.summary.unanalyzed'), + ("src/wardline/mcp/server.py", 333, '"gate": {'), + ("src/wardline/mcp/server.py", 334, '"tripped": decision.tripped'), + # src/wardline/core/agent_summary.py — agent-summary JSON keys + ("src/wardline/core/agent_summary.py", 89, '"total_findings"'), + ("src/wardline/core/agent_summary.py", 90, '"active_defects"'), + ("src/wardline/core/agent_summary.py", 91, '"suppressed_findings"'), + ("src/wardline/core/agent_summary.py", 93, '"baselined"'), + ("src/wardline/core/agent_summary.py", 94, 
'"waived"'), + ("src/wardline/core/agent_summary.py", 95, '"judged"'), + ("src/wardline/core/agent_summary.py", 96, '"unanalyzed"'), + ("src/wardline/core/agent_summary.py", 99, '"tripped": self.gate.tripped'), + # stable-file anchors (lower churn, but locked for free) + ("src/wardline/core/finding.py", 68, 'ACTIVE = "active"'), + ("src/wardline/core/suppression.py", 70, "SuppressionState.BASELINED"), +) + def test_glossary_defines_every_suppression_state() -> None: text = _GLOSSARY.read_text(encoding="utf-8") @@ -27,3 +75,26 @@ def test_glossary_defines_every_suppression_state() -> None: def test_glossary_in_nav() -> None: nav = _MKDOCS.read_text(encoding="utf-8") assert _NAV_PATH in nav, f"{_NAV_PATH} is not wired into the mkdocs nav" + + +def test_glossary_anchors_bind_to_code() -> None: + """Each load-bearing ``file:line`` the glossary cites must point at the right code. + + Two-way lock: (1) the cited source line still contains its anchor token (catches code + that moved out from under the citation), and (2) the glossary actually cites that line + (catches the doc drifting away from the code). Both must hold, so doc + code can never + silently diverge — the exact rot this PR's review found. + """ + text = _GLOSSARY.read_text(encoding="utf-8") + for relpath, line, token in _ANCHORS: + code = (_REPO / relpath).read_text(encoding="utf-8").splitlines() + assert 1 <= line <= len(code), f"{relpath}:{line} is out of range ({len(code)} lines)" + assert token in code[line - 1], ( + f"{relpath}:{line} no longer contains {token!r} (got {code[line - 1]!r}); " + f"update both the source line in _ANCHORS and the glossary citation" + ) + base = relpath.rsplit("/", 1)[-1] + # The glossary cites the basename (`run.py:280`) or a full path, possibly inside a + # comma/dash list (`run.py:49,280` / `run.py:82-92`). Require the line to appear. 
+ cite = re.compile(rf"`(?:[\w./-]+/)?{re.escape(base)}:[\d,\-]*\b{line}\b") + assert cite.search(text), f"glossary no longer cites {base}:{line} (anchor {token!r})" diff --git a/tests/unit/cli/test_cli.py b/tests/unit/cli/test_cli.py index d93cf20b..8080e8d4 100644 --- a/tests/unit/cli/test_cli.py +++ b/tests/unit/cli/test_cli.py @@ -787,7 +787,7 @@ def __init__(self, url, **kw): def emit(self, findings, *, scanned_paths=()): from wardline.core.filigree_emit import EmitResult - return EmitResult(reachable=False, status=401, auth_rejected=True) + return EmitResult(reachable=False, status=401) # auth_rejected derived from status monkeypatch.setattr("wardline.cli.scan.FiligreeEmitter", _AuthRejectedEmitter) out = tmp_path / "f.jsonl" @@ -799,8 +799,12 @@ def emit(self, findings, *, scanned_paths=()): assert "wardline_filigree_token" in low -def _emitter_returning(status, *, auth_rejected): - """A FiligreeEmitter stand-in that always returns a canned soft EmitResult.""" +def _emitter_returning(status): + """A FiligreeEmitter stand-in that always returns a canned soft EmitResult. + + ``auth_rejected`` is derived from ``status`` (401/403), so the caller need only pin the + status the soft path reports. 
+ """ class _E: def __init__(self, url, **kw): @@ -809,7 +813,7 @@ def __init__(self, url, **kw): def emit(self, findings, *, scanned_paths=()): from wardline.core.filigree_emit import EmitResult - return EmitResult(reachable=False, status=status, auth_rejected=auth_rejected) + return EmitResult(reachable=False, status=status) return _E @@ -820,7 +824,7 @@ def test_scan_filigree_403_says_forbidden_not_set_a_token(tmp_path, monkeypatch) proj = tmp_path / "proj" proj.mkdir() _write(proj, "svc.py", _LEAKY) - monkeypatch.setattr("wardline.cli.scan.FiligreeEmitter", _emitter_returning(403, auth_rejected=True)) + monkeypatch.setattr("wardline.cli.scan.FiligreeEmitter", _emitter_returning(403)) out = tmp_path / "f.jsonl" result = CliRunner().invoke(scan, [str(proj), "--output", str(out), "--filigree-url", "http://x"]) assert result.exit_code == 0, result.output @@ -836,7 +840,7 @@ def test_scan_filigree_5xx_says_server_error_not_unreachable(tmp_path, monkeypat proj = tmp_path / "proj" proj.mkdir() _write(proj, "svc.py", _LEAKY) - monkeypatch.setattr("wardline.cli.scan.FiligreeEmitter", _emitter_returning(503, auth_rejected=False)) + monkeypatch.setattr("wardline.cli.scan.FiligreeEmitter", _emitter_returning(503)) out = tmp_path / "f.jsonl" result = CliRunner().invoke(scan, [str(proj), "--output", str(out), "--filigree-url", "http://x"]) assert result.exit_code == 0, result.output diff --git a/tests/unit/core/test_filigree_emit.py b/tests/unit/core/test_filigree_emit.py index c0b573c8..ad8b27a7 100644 --- a/tests/unit/core/test_filigree_emit.py +++ b/tests/unit/core/test_filigree_emit.py @@ -179,6 +179,29 @@ def test_http_5xx_carries_status_but_is_not_auth_rejected() -> None: assert res.auth_rejected is False +def test_emit_result_auth_rejected_is_derived_from_status() -> None: + # ``auth_rejected`` is not an independent axis — it is exactly ``status in (401, 403)``. + # Deriving it makes "auth-rejected (200)" and "auth-rejected with a 5xx" unrepresentable. 
+ assert EmitResult(reachable=False, status=401).auth_rejected is True + assert EmitResult(reachable=False, status=403).auth_rejected is True + assert EmitResult(reachable=False, status=503).auth_rejected is False + assert EmitResult(reachable=False).auth_rejected is False + assert EmitResult(reachable=True, created=1).auth_rejected is False + + +def test_emit_result_rejects_contradictory_states() -> None: + # The redundant ``auth_rejected`` axis is gone: it can no longer be set independently + # (so it can never disagree with ``status``). + with pytest.raises(TypeError): + EmitResult(reachable=False, status=200, auth_rejected=True) # type: ignore[call-arg] + # Mirror GateDecision's construction guard: a reached/success result carries no error + # status, and a soft-failure created/updated nothing. + with pytest.raises(ValueError): + EmitResult(reachable=True, status=503) + with pytest.raises(ValueError): + EmitResult(reachable=False, created=1) + + def test_bearer_token_carried_when_provided() -> None: t = _FakeTransport(response=Response(status=200, body=_ok_body())) FiligreeEmitter("http://x/api/weft/scan-results", transport=t, token="sekret").emit([_f()]) diff --git a/tests/unit/mcp/test_server_filigree_emit.py b/tests/unit/mcp/test_server_filigree_emit.py index b8fdfcee..2eaf8cfc 100644 --- a/tests/unit/mcp/test_server_filigree_emit.py +++ b/tests/unit/mcp/test_server_filigree_emit.py @@ -136,7 +136,7 @@ def test_scan_filigree_401_surfaces_auth_reason_to_agent(tmp_path): # Dogfood #5 (MCP parity): a 401 stays soft but the agent must read an actionable # disabled_reason naming the token, not a flat "unreachable". 
(tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") - out = _scan({}, tmp_path, None, FakeEmitter(EmitResult(reachable=False, status=401, auth_rejected=True))) + out = _scan({}, tmp_path, None, FakeEmitter(EmitResult(reachable=False, status=401))) assert out["filigree"]["reachable"] is False # still soft reason = out["filigree_emit"]["disabled_reason"] assert "401" in reason and "WARDLINE_FILIGREE_TOKEN" in reason @@ -148,7 +148,7 @@ def test_scan_filigree_403_says_forbidden_not_set_a_token(tmp_path): # (the token is present and lacks access / is blocked). The reason must say forbidden, # not point at the env var. (tmp_path / "svc.py").write_text(_LEAKY, encoding="utf-8") - out = _scan({}, tmp_path, None, FakeEmitter(EmitResult(reachable=False, status=403, auth_rejected=True))) + out = _scan({}, tmp_path, None, FakeEmitter(EmitResult(reachable=False, status=403))) assert out["filigree"]["reachable"] is False # still soft reason = out["filigree_emit"]["disabled_reason"] assert "403" in reason and "forbidden" in reason diff --git a/tests/unit/scanner/rules/test_contradictory_trust.py b/tests/unit/scanner/rules/test_contradictory_trust.py index 6de42e71..872dc26f 100644 --- a/tests/unit/scanner/rules/test_contradictory_trust.py +++ b/tests/unit/scanner/rules/test_contradictory_trust.py @@ -140,6 +140,50 @@ def conflicting(p): assert [(f.rule_id, f.qualname) for f in findings] == [("PY-WL-110", "m.conflicting")] +def test_user_own_trust_named_decorators_do_not_fire(tmp_path) -> None: + # A user's OWN @trusted / @external_boundary imported from a NON-grammar module must + # not be mistaken for the builtin trust vocabulary. Here the engine never anchors the + # entity (provenance source = "fallback", not "anchored"), so the rule's opt-in gate + # (prov.source == "anchored") filters it before marker-counting. This guards the + # system-level behaviour: foreign trust-named decorators don't trip PY-WL-110 at all. 
+ ctx = _analyze( + tmp_path, + """ + from myapp.security import trusted, external_boundary + @trusted + @external_boundary + def f(p): + return p + """, + ) + assert _run(ctx) == [] + + +def test_anchored_entity_ignores_foreign_module_marker(tmp_path) -> None: + # The isolating guard for the `_MARKER_MODULE_PREFIXES` check (contradictory_trust.py + # line ~81). This entity DOES anchor (via the real `wardline.decorators.trust_boundary` + # validator), so it passes the prov.source=="anchored" gate — but the coincidentally + # named `myapp.security.trusted` must NOT be counted as a second marker, because its + # module prefix is not in the grammar. Only `trust_boundary` counts, so len(markers) < 2 + # and nothing fires. If the prefix check regressed (keying on the bare name), the foreign + # `trusted` would be counted, yielding a FALSE PY-WL-110 on legitimate user code. + # (Verified empirically: without this guard the foreign marker is counted and it fires.) + ctx = _analyze( + tmp_path, + """ + from wardline.decorators import trust_boundary + from myapp.security import trusted + @trust_boundary(to_level='ASSURED') + @trusted + def f(p): + if not p: + raise ValueError + return p + """, + ) + assert _run(ctx) == [] + + def test_weft_markers_call_form_fires(tmp_path) -> None: # The called form (@trusted(level=...) + @external_boundary) over weft_markers. ctx = _analyze( From 58ab20f9cabd89e8a6953e8f154b51bff932197c Mon Sep 17 00:00:00 2001 From: John Morrissey Date: Sun, 7 Jun 2026 00:54:21 +1000 Subject: [PATCH 17/17] chore(release): bump version to 1.0.0rc3 Co-Authored-By: Claude Opus 4.8 (1M context) --- src/wardline/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wardline/_version.py b/src/wardline/_version.py index 5a662149..5b243aad 100644 --- a/src/wardline/_version.py +++ b/src/wardline/_version.py @@ -1 +1 @@ -__version__ = "1.0.0rc2" +__version__ = "1.0.0rc3"