From 68f6c2d807ff8097acdb60c78f62ee435ccc9377 Mon Sep 17 00:00:00 2001 From: TheGr3atJosh <90441217+TheGr3atJosh@users.noreply.github.com> Date: Sat, 9 May 2026 21:55:06 +0200 Subject: [PATCH 1/6] feat: resolve Ghostwriter richtext-evidence divs to inline images Ghostwriter's TinyMCE stores inline evidence as a div element with class="richtext-evidence" and a numeric data-evidence-id attribute.
The previous resolver only handled the older {{.friendly_name}} tag syntax, which is not emitted by current Ghostwriter versions. This caused inline evidence to silently disappear from rendered PDFs. Changes: - _build_evidence_index now returns two indexes: by friendly_name and by numeric id, built in a single JSON walk - _resolve_richtext_evidence replaces richtext-evidence divs with figure/img/figcaption markup
using the id-based index - Both resolvers run sequentially per field in make_vue_data; the {{.name}} resolver is kept for backwards compatibility Co-Authored-By: Claude Sonnet 4.6 --- app/rendering/pipeline.py | 151 +++++++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 2 deletions(-) diff --git a/app/rendering/pipeline.py b/app/rendering/pipeline.py index 9b3a4f9..edadda8 100644 --- a/app/rendering/pipeline.py +++ b/app/rendering/pipeline.py @@ -1,6 +1,8 @@ """Orchestrate the full report → PDF pipeline.""" from __future__ import annotations +import html +import re from html.parser import HTMLParser from pathlib import Path @@ -11,6 +13,32 @@ BUNDLE = Path(__file__).parents[2] / "packages" / "rendering" / "dist" / "bundle.js" +# Matches Ghostwriter's old-dot-var syntax: {{.name}}, {{.ref name}}, {{.caption name}} +_GW_TAG_RE = re.compile(r"\{\{\s*\.([^\{\}]*?)\s*\}\}") + +# Matches Ghostwriter's newer TinyMCE richtext evidence div: +#
+# Attribute order may vary; this handles both orderings. +_GW_RICHTEXT_RE = re.compile( + r']*\bclass="richtext-evidence"[^>]*\bdata-evidence-id="(\d+)"[^>]*>\s*' + r'|' + r']*\bdata-evidence-id="(\d+)"[^>]*\bclass="richtext-evidence"[^>]*>\s*' +) + +# Finding text fields that can contain inline evidence references (mirrors Ghostwriter's allowlist) +_FINDING_TEXT_FIELDS = ( + "title", + "affected_entities", + "description", + "impact", + "mitigation", + "recommendation", + "replication_steps", + "host_detection_techniques", + "network_detection_techniques", + "references", +) + _SEVERITY: dict[str, tuple[int, str]] = { "critical": (1, "critical"), "high": (2, "high"), @@ -20,6 +48,112 @@ "info": (5, "info"), } +def _build_evidence_index(report_json: dict) -> tuple[dict[str, dict], dict[int, dict]]: + """Walk the report JSON and return two evidence lookups: + by_name: friendly_name → evidence object (for {{.name}} tags) + by_id: numeric id → evidence object (for richtext-evidence divs) + """ + by_name: dict[str, dict] = {} + by_id: dict[int, dict] = {} + + def _walk(obj: object) -> None: + if isinstance(obj, dict): + p = obj.get("path") + fn = obj.get("friendly_name") + eid = obj.get("id") + if ( + isinstance(p, str) and p.startswith("evidence/") + and isinstance(eid, int) + and isinstance(fn, str) and fn + ): + by_name[fn] = obj + by_id[eid] = obj + for v in obj.values(): + _walk(v) + elif isinstance(obj, list): + for item in obj: + _walk(item) + + _walk(report_json) + return by_name, by_id + + +def _resolve_inline_evidence(text: str, ev_index: dict[str, dict]) -> str: + """Replace Ghostwriter inline evidence tags in an HTML field. 
+ + Ghostwriter stores two tag forms in rich-text fields that are shipped as-is + in the generateReport JSON (Jinja2 is not applied for JSON export): + + {{.friendly_name}} → inline evidence image + {{.ref friendly_name}} → text reference (friendly name / caption) + {{.caption friendly_name}} → caption label for the figure + + We replace them with HTML that the Vue renderer can handle. + """ + if not ev_index or not text or "{{" not in text: + return text + + def _replace(m: re.Match) -> str: + contents = m.group(1).strip() + + if contents.startswith("ref "): + name = contents[4:].strip() + ev = ev_index.get(name) + if ev: + return html.escape(ev.get("caption") or ev.get("friendly_name") or name) + return html.escape(name) + + if contents.startswith("caption "): + name = contents[8:].strip() + ev = ev_index.get(name) + if ev: + return html.escape(ev.get("caption") or ev.get("friendly_name") or name) + return html.escape(name) + + # Plain {{.name}} → inline evidence image wrapped in figure/figcaption + name = contents + ev = ev_index.get(name) + if ev and ev.get("path"): + caption = html.escape(ev.get("caption") or ev.get("friendly_name") or name) + path = ev["path"] + return ( + f'
' + f'{caption}' + f'
{caption}
' + f'
' + ) + return m.group(0) # unknown name — leave unchanged + + return _GW_TAG_RE.sub(_replace, text) + + +def _resolve_richtext_evidence(text: str, ev_by_id: dict[int, dict]) -> str: + """Replace Ghostwriter richtext-evidence divs with figure/img/figcaption. + + Newer Ghostwriter TinyMCE versions store inline evidence as: +
+ instead of the older {{.friendly_name}} tag syntax. + """ + if not ev_by_id or not text or 'richtext-evidence' not in text: + return text + + def _replace(m: re.Match) -> str: + eid_str = m.group(1) or m.group(2) + ev = ev_by_id.get(int(eid_str)) + if ev and ev.get("path"): + caption = html.escape(ev.get("caption") or ev.get("friendly_name") or eid_str) + path = ev["path"] + return ( + f'
' + f'{caption}' + f'
{caption}
' + f'
' + ) + return m.group(0) # unknown id — leave unchanged + + return _GW_RICHTEXT_RE.sub(_replace, text) + + class _TextExtractor(HTMLParser): def __init__(self): super().__init__() @@ -61,6 +195,8 @@ def make_vue_data(raw: dict) -> dict: normalised to None so templates can use a simple truthiness check to conditionally render optional sections. """ + ev_by_name, ev_by_id = _build_evidence_index(raw) + findings = [] for f in raw.get("findings") or []: f = dict(f) @@ -72,6 +208,17 @@ def make_vue_data(raw: dict) -> dict: "score": float(f.get("cvss_score") or 0), "vector": f.get("cvss_vector") or "n/a", } + if ev_by_name or ev_by_id: + for field in _FINDING_TEXT_FIELDS: + raw_val = f.get(field) + if not isinstance(raw_val, str): + continue + resolved = raw_val + if "{{" in resolved: + resolved = _resolve_inline_evidence(resolved, ev_by_name) + if "richtext-evidence" in resolved: + resolved = _resolve_richtext_evidence(resolved, ev_by_id) + f[field] = resolved findings.append(f) report = dict(raw) @@ -107,5 +254,5 @@ def render_report( bundle_js = BUNDLE.read_text("utf-8") resources = build(template, report_json) - html = render_to_html(data, template_html, css, bundle_js, language, resources) - return render_to_pdf(html, resources) + rendered_html = render_to_html(data, template_html, css, bundle_js, language, resources) + return render_to_pdf(rendered_html, resources) From 0588fa77c31bf7a34a1f7ccfe98af87ff554e903 Mon Sep 17 00:00:00 2001 From: onur <67955086+otuva@users.noreply.github.com> Date: Tue, 19 May 2026 10:16:09 +0300 Subject: [PATCH 2/6] fix: escape evidence path in img src to prevent attribute injection --- app/rendering/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/rendering/pipeline.py b/app/rendering/pipeline.py index edadda8..463b8e2 100644 --- a/app/rendering/pipeline.py +++ b/app/rendering/pipeline.py @@ -115,7 +115,7 @@ def _replace(m: re.Match) -> str: ev = ev_index.get(name) if ev and ev.get("path"): caption 
= html.escape(ev.get("caption") or ev.get("friendly_name") or name) - path = ev["path"] + path = html.escape(ev["path"]) return ( f'
' f'{caption}' @@ -142,7 +142,7 @@ def _replace(m: re.Match) -> str: ev = ev_by_id.get(int(eid_str)) if ev and ev.get("path"): caption = html.escape(ev.get("caption") or ev.get("friendly_name") or eid_str) - path = ev["path"] + path = html.escape(ev["path"]) return ( f'
' f'{caption}' From b1089e9e1e6744f43bdee1a0b2be2ebadad0eddb Mon Sep 17 00:00:00 2001 From: onur <67955086+otuva@users.noreply.github.com> Date: Tue, 19 May 2026 10:16:39 +0300 Subject: [PATCH 3/6] refactor: consolidate evidence JSON walking into collect_evidence _build_evidence_index was duplicating the same recursive walk as collect_paths in evidence.py. Extract collect_evidence() as the shared primitive; both callers now iterate its results instead of re-implementing the tree walk. --- app/rendering/pipeline.py | 28 +++++++--------------------- app/reporting/evidence.py | 23 ++++++++++++++++------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/app/rendering/pipeline.py b/app/rendering/pipeline.py index 463b8e2..ef4802b 100644 --- a/app/rendering/pipeline.py +++ b/app/rendering/pipeline.py @@ -7,6 +7,7 @@ from pathlib import Path from ..reporting import ReportTemplate +from ..reporting.evidence import collect_evidence from .chromium import render_to_html from .resources import build from .weasyprint import render_to_pdf @@ -49,32 +50,17 @@ } def _build_evidence_index(report_json: dict) -> tuple[dict[str, dict], dict[int, dict]]: - """Walk the report JSON and return two evidence lookups: + """Build two evidence lookups from the report JSON: by_name: friendly_name → evidence object (for {{.name}} tags) by_id: numeric id → evidence object (for richtext-evidence divs) """ by_name: dict[str, dict] = {} by_id: dict[int, dict] = {} - - def _walk(obj: object) -> None: - if isinstance(obj, dict): - p = obj.get("path") - fn = obj.get("friendly_name") - eid = obj.get("id") - if ( - isinstance(p, str) and p.startswith("evidence/") - and isinstance(eid, int) - and isinstance(fn, str) and fn - ): - by_name[fn] = obj - by_id[eid] = obj - for v in obj.values(): - _walk(v) - elif isinstance(obj, list): - for item in obj: - _walk(item) - - _walk(report_json) + for ev in collect_evidence(report_json): + fn = ev.get("friendly_name") + if isinstance(fn, str) and fn: 
+ by_name[fn] = ev + by_id[ev["id"]] = ev return by_name, by_id diff --git a/app/reporting/evidence.py b/app/reporting/evidence.py index 79b0043..e24d46f 100644 --- a/app/reporting/evidence.py +++ b/app/reporting/evidence.py @@ -18,23 +18,32 @@ def local_path(evidence_path: str) -> Path: return _EVIDENCE_DIR / Path(evidence_path).relative_to("evidence") -def collect_paths(obj: object) -> dict[str, int]: +def collect_evidence(obj: object) -> list[dict]: """Recursively find all evidence objects in the report JSON. - Returns a mapping of path -> evidence_id, e.g. {"evidence/2/foo.png": 3}. + An evidence object is any dict with a 'path' starting with 'evidence/' + and an integer 'id'. """ - paths: dict[str, int] = {} + found: list[dict] = [] if isinstance(obj, dict): p = obj.get("path") eid = obj.get("id") if isinstance(p, str) and p.startswith("evidence/") and isinstance(eid, int): - paths[p] = eid + found.append(obj) for v in obj.values(): - paths |= collect_paths(v) + found.extend(collect_evidence(v)) elif isinstance(obj, list): for item in obj: - paths |= collect_paths(item) - return paths + found.extend(collect_evidence(item)) + return found + + +def collect_paths(obj: object) -> dict[str, int]: + """Recursively find all evidence objects in the report JSON. + + Returns a mapping of path -> evidence_id, e.g. {"evidence/2/foo.png": 3}. 
+ """ + return {ev["path"]: ev["id"] for ev in collect_evidence(obj)} def _fetch_and_save(client: GhostwriterClient, evidence_id: int, path: str, media_path: Path | None) -> tuple[str, bool]: From 0ee6af39c9b00ed84fe9a7e189b880751fb4154c Mon Sep 17 00:00:00 2001 From: onur <67955086+otuva@users.noreply.github.com> Date: Tue, 19 May 2026 10:16:54 +0300 Subject: [PATCH 4/6] fix: remove title from _FINDING_TEXT_FIELDS; document extra_fields exclusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit title is plain text in Ghostwriter — inline evidence tags are never stored there, and resolving them would silently mangle any finding name that happens to contain {{...}} syntax. Added comments to explain why both title and extra_fields are excluded from evidence resolution. --- app/rendering/pipeline.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/app/rendering/pipeline.py b/app/rendering/pipeline.py index ef4802b..7407431 100644 --- a/app/rendering/pipeline.py +++ b/app/rendering/pipeline.py @@ -26,9 +26,11 @@ r']*\bdata-evidence-id="(\d+)"[^>]*\bclass="richtext-evidence"[^>]*>\s*' ) -# Finding text fields that can contain inline evidence references (mirrors Ghostwriter's allowlist) +# Rich-text finding fields that Ghostwriter allows inline evidence in. +# "title" is intentionally excluded — it is plain text, never richtext. +# report.extra_fields are also excluded; Ghostwriter does not support inline +# evidence there, so we leave those fields untouched. 
_FINDING_TEXT_FIELDS = ( - "title", "affected_entities", "description", "impact", From a2cad351ad678de720e7748520acc016900bee96 Mon Sep 17 00:00:00 2001 From: onur <67955086+otuva@users.noreply.github.com> Date: Tue, 19 May 2026 10:17:26 +0300 Subject: [PATCH 5/6] test: add unit tests for evidence index and resolution functions Covers _build_evidence_index, _resolve_inline_evidence, and _resolve_richtext_evidence: happy paths, both attribute orderings for the richtext div regex, caption/friendly_name fallback, unknown name/id passthrough, and HTML escaping of both path and caption. --- tests/test_pipeline.py | 138 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 tests/test_pipeline.py diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..5a3f9d3 --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,138 @@ +import pytest + +from app.rendering.pipeline import ( + _build_evidence_index, + _resolve_inline_evidence, + _resolve_richtext_evidence, +) + +_EV1 = { + "id": 1, + "path": "evidence/1/shot.png", + "friendly_name": "login_page", + "caption": "Login page screenshot", +} +_EV2 = { + "id": 2, + "path": "evidence/2/admin.png", + "friendly_name": "admin_panel", + "caption": None, +} +_REPORT = {"findings": [{"evidence": [_EV1, _EV2]}]} + + +class TestBuildEvidenceIndex: + def test_indexes_by_name_and_id(self): + by_name, by_id = _build_evidence_index(_REPORT) + assert by_name["login_page"] is _EV1 + assert by_id[1] is _EV1 + assert by_name["admin_panel"] is _EV2 + assert by_id[2] is _EV2 + + def test_empty_report_returns_empty_indexes(self): + by_name, by_id = _build_evidence_index({}) + assert by_name == {} + assert by_id == {} + + def test_evidence_without_friendly_name_indexed_by_id_only(self): + ev = {"id": 3, "path": "evidence/3/x.png"} + by_name, by_id = _build_evidence_index({"ev": ev}) + assert 3 in by_id + assert by_name == {} + + +class TestResolveInlineEvidence: + 
@pytest.fixture(autouse=True) + def index(self): + self.idx = {"login_page": _EV1, "admin_panel": _EV2} + + def test_plain_name_produces_figure(self): + result = _resolve_inline_evidence("{{.login_page}}", self.idx) + assert 'Login page screenshot
" in result + + def test_plain_name_with_whitespace(self): + result = _resolve_inline_evidence("{{. login_page }}", self.idx) + assert 'xss"} + result = _resolve_inline_evidence("{{.x}}", {"x": ev}) + assert ""} + div = '
' + result = _resolve_richtext_evidence(div, {5: ev}) + assert ""} - result = _resolve_inline_evidence("{{.x}}", {"x": ev}) + result, ids = _resolve_inline_evidence("{{.x}}", {"x": ev}) assert ""} div = '
' - result = _resolve_richtext_evidence(div, {5: ev}) + result, ids = _resolve_richtext_evidence(div, {5: ev}) assert "