diff --git a/app/rendering/pipeline.py b/app/rendering/pipeline.py index 9b3a4f9..fc0c6b8 100644 --- a/app/rendering/pipeline.py +++ b/app/rendering/pipeline.py @@ -1,16 +1,47 @@ """Orchestrate the full report → PDF pipeline.""" from __future__ import annotations +import html +import re from html.parser import HTMLParser from pathlib import Path from ..reporting import ReportTemplate +from ..reporting.evidence import collect_evidence from .chromium import render_to_html from .resources import build from .weasyprint import render_to_pdf BUNDLE = Path(__file__).parents[2] / "packages" / "rendering" / "dist" / "bundle.js" +# Matches Ghostwriter's old-dot-var syntax: {{.name}}, {{.ref name}}, {{.caption name}} +_GW_TAG_RE = re.compile(r"\{\{\s*\.([^\{\}]*?)\s*\}\}") + +# Matches Ghostwriter's newer TinyMCE richtext evidence div: +#
+# Attribute order may vary; this handles both orderings. +_GW_RICHTEXT_RE = re.compile( + r']*\bclass="richtext-evidence"[^>]*\bdata-evidence-id="(\d+)"[^>]*>\s*' + r'|' + r']*\bdata-evidence-id="(\d+)"[^>]*\bclass="richtext-evidence"[^>]*>\s*' +) + +# Rich-text finding fields that Ghostwriter allows inline evidence in. +# "title" is intentionally excluded — it is plain text, never richtext. +# report.extra_fields are also excluded; Ghostwriter does not support inline +# evidence there, so we leave those fields untouched. +_FINDING_TEXT_FIELDS = ( + "affected_entities", + "description", + "impact", + "mitigation", + "recommendation", + "replication_steps", + "host_detection_techniques", + "network_detection_techniques", + "references", +) + _SEVERITY: dict[str, tuple[int, str]] = { "critical": (1, "critical"), "high": (2, "high"), @@ -20,6 +51,105 @@ "info": (5, "info"), } +def _build_evidence_index(report_json: dict) -> tuple[dict[str, dict], dict[int, dict]]: + """Build two evidence lookups from the report JSON: + by_name: friendly_name → evidence object (for {{.name}} tags) + by_id: numeric id → evidence object (for richtext-evidence divs) + """ + by_name: dict[str, dict] = {} + by_id: dict[int, dict] = {} + for ev in collect_evidence(report_json): + fn = ev.get("friendly_name") + if isinstance(fn, str) and fn: + by_name[fn] = ev + by_id[ev["id"]] = ev + return by_name, by_id + + +def _resolve_inline_evidence(text: str, ev_index: dict[str, dict]) -> tuple[str, set[int]]: + """Replace Ghostwriter inline evidence tags in an HTML field. + + Ghostwriter stores two tag forms in rich-text fields that are shipped as-is + in the generateReport JSON (Jinja2 is not applied for JSON export): + + {{.friendly_name}} → inline evidence image + {{.ref friendly_name}} → text reference (friendly name / caption) + {{.caption friendly_name}} → caption label for the figure + + Returns (resolved_text, set of evidence IDs that were embedded as images). + """ + if not ev_index or not text or "{{" not in text: + return text, set() + + used_ids: set[int] = set() + + def _replace(m: re.Match) -> str: + contents = m.group(1).strip() + + if contents.startswith("ref "): + name = contents[4:].strip() + ev = ev_index.get(name) + if ev: + return html.escape(ev.get("caption") or ev.get("friendly_name") or name) + return html.escape(name) + + if contents.startswith("caption "): + name = contents[8:].strip() + ev = ev_index.get(name) + if ev: + return html.escape(ev.get("caption") or ev.get("friendly_name") or name) + return html.escape(name) + + # Plain {{.name}} → inline evidence image wrapped in figure/figcaption + name = contents + ev = ev_index.get(name) + if ev and ev.get("path"): + used_ids.add(ev["id"]) + caption = html.escape(ev.get("caption") or ev.get("friendly_name") or name) + path = html.escape(ev["path"]) + return ( + f'
' + f'{caption}' + f'
{caption}
' + f'
' + ) + return m.group(0) # unknown name — leave unchanged + + return _GW_TAG_RE.sub(_replace, text), used_ids + + +def _resolve_richtext_evidence(text: str, ev_by_id: dict[int, dict]) -> tuple[str, set[int]]: + """Replace Ghostwriter richtext-evidence divs with figure/img/figcaption. + + Newer Ghostwriter TinyMCE versions store inline evidence as: +
+ instead of the older {{.friendly_name}} tag syntax. + + Returns (resolved_text, set of evidence IDs that were embedded as images). + """ + if not ev_by_id or not text or 'richtext-evidence' not in text: + return text, set() + + used_ids: set[int] = set() + + def _replace(m: re.Match) -> str: + eid_str = m.group(1) or m.group(2) + ev = ev_by_id.get(int(eid_str)) + if ev and ev.get("path"): + used_ids.add(ev["id"]) + caption = html.escape(ev.get("caption") or ev.get("friendly_name") or eid_str) + path = html.escape(ev["path"]) + return ( + f'
' + f'{caption}' + f'
{caption}
' + f'
' + ) + return m.group(0) # unknown id — leave unchanged + + return _GW_RICHTEXT_RE.sub(_replace, text), used_ids + + class _TextExtractor(HTMLParser): def __init__(self): super().__init__() @@ -61,6 +191,8 @@ def make_vue_data(raw: dict) -> dict: normalised to None so templates can use a simple truthiness check to conditionally render optional sections. """ + ev_by_name, ev_by_id = _build_evidence_index(raw) + findings = [] for f in raw.get("findings") or []: f = dict(f) @@ -72,6 +204,22 @@ def make_vue_data(raw: dict) -> dict: "score": float(f.get("cvss_score") or 0), "vector": f.get("cvss_vector") or "n/a", } + if ev_by_name or ev_by_id: + inline_ids: set[int] = set() + for field in _FINDING_TEXT_FIELDS: + raw_val = f.get(field) + if not isinstance(raw_val, str): + continue + resolved = raw_val + if "{{" in resolved: + resolved, used = _resolve_inline_evidence(resolved, ev_by_name) + inline_ids |= used + if "richtext-evidence" in resolved: + resolved, used = _resolve_richtext_evidence(resolved, ev_by_id) + inline_ids |= used + f[field] = resolved + if inline_ids and isinstance(f.get("evidence"), list): + f["evidence"] = [ev for ev in f["evidence"] if ev.get("id") not in inline_ids] findings.append(f) report = dict(raw) @@ -107,5 +255,5 @@ def render_report( bundle_js = BUNDLE.read_text("utf-8") resources = build(template, report_json) - html = render_to_html(data, template_html, css, bundle_js, language, resources) - return render_to_pdf(html, resources) + rendered_html = render_to_html(data, template_html, css, bundle_js, language, resources) + return render_to_pdf(rendered_html, resources) diff --git a/app/reporting/evidence.py b/app/reporting/evidence.py index 79b0043..e24d46f 100644 --- a/app/reporting/evidence.py +++ b/app/reporting/evidence.py @@ -18,23 +18,32 @@ def local_path(evidence_path: str) -> Path: return _EVIDENCE_DIR / Path(evidence_path).relative_to("evidence") -def collect_paths(obj: object) -> dict[str, int]: +def collect_evidence(obj: object) -> list[dict]: """Recursively find all evidence objects in the report JSON. - Returns a mapping of path -> evidence_id, e.g. {"evidence/2/foo.png": 3}. + An evidence object is any dict with a 'path' starting with 'evidence/' + and an integer 'id'. """ - paths: dict[str, int] = {} + found: list[dict] = [] if isinstance(obj, dict): p = obj.get("path") eid = obj.get("id") if isinstance(p, str) and p.startswith("evidence/") and isinstance(eid, int): - paths[p] = eid + found.append(obj) for v in obj.values(): - paths |= collect_paths(v) + found.extend(collect_evidence(v)) elif isinstance(obj, list): for item in obj: - paths |= collect_paths(item) - return paths + found.extend(collect_evidence(item)) + return found + + +def collect_paths(obj: object) -> dict[str, int]: + """Recursively find all evidence objects in the report JSON. + + Returns a mapping of path -> evidence_id, e.g. {"evidence/2/foo.png": 3}. + """ + return {ev["path"]: ev["id"] for ev in collect_evidence(obj)} def _fetch_and_save(client: GhostwriterClient, evidence_id: int, path: str, media_path: Path | None) -> tuple[str, bool]: diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..9777643 --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,208 @@ +import pytest + +from app.rendering.pipeline import ( + _build_evidence_index, + _resolve_inline_evidence, + _resolve_richtext_evidence, + make_vue_data, +) + +_EV1 = { + "id": 1, + "path": "evidence/1/shot.png", + "friendly_name": "login_page", + "caption": "Login page screenshot", +} +_EV2 = { + "id": 2, + "path": "evidence/2/admin.png", + "friendly_name": "admin_panel", + "caption": None, +} +_REPORT = {"findings": [{"evidence": [_EV1, _EV2]}]} + + +class TestBuildEvidenceIndex: + def test_indexes_by_name_and_id(self): + by_name, by_id = _build_evidence_index(_REPORT) + assert by_name["login_page"] is _EV1 + assert by_id[1] is _EV1 + assert by_name["admin_panel"] is _EV2 + assert by_id[2] is _EV2 + + def test_empty_report_returns_empty_indexes(self): + by_name, by_id = _build_evidence_index({}) + assert by_name == {} + assert by_id == {} + + def test_evidence_without_friendly_name_indexed_by_id_only(self): + ev = {"id": 3, "path": "evidence/3/x.png"} + by_name, by_id = _build_evidence_index({"ev": ev}) + assert 3 in by_id + assert by_name == {} + + +class TestResolveInlineEvidence: + @pytest.fixture(autouse=True) + def index(self): + self.idx = {"login_page": _EV1, "admin_panel": _EV2} + + def test_plain_name_produces_figure(self): + result, ids = _resolve_inline_evidence("{{.login_page}}", self.idx) + assert 'Login page screenshot" in result + assert ids == {1} + + def test_plain_name_with_whitespace(self): + result, ids = _resolve_inline_evidence("{{. login_page }}", self.idx) + assert 'xss"} + result, ids = _resolve_inline_evidence("{{.x}}", {"x": ev}) + assert ""} + div = '
' + result, ids = _resolve_richtext_evidence(div, {5: ev}) + assert "