Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 150 additions & 2 deletions app/rendering/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,47 @@
"""Orchestrate the full report → PDF pipeline."""
from __future__ import annotations

import html
import re
from html.parser import HTMLParser
from pathlib import Path

from ..reporting import ReportTemplate
from ..reporting.evidence import collect_evidence
from .chromium import render_to_html
from .resources import build
from .weasyprint import render_to_pdf

# Location of the pre-built rendering bundle: packages/rendering/dist/bundle.js
# under the third ancestor directory of this file (``parents[2]``).
BUNDLE = Path(__file__).parents[2] / "packages" / "rendering" / "dist" / "bundle.js"

# Matches Ghostwriter's old-dot-var syntax: {{.name}}, {{.ref name}}, {{.caption name}}
# Group 1 captures everything between the leading dot and the closing braces
# (trailing whitespace excluded). The captured text can never contain "{" or
# "}", and whitespace is tolerated after "{{" and before "}}".
_GW_TAG_RE = re.compile(r"\{\{\s*\.([^\{\}]*?)\s*\}\}")

# Matches Ghostwriter's newer TinyMCE richtext evidence div:
# <div class="richtext-evidence" data-evidence-id="4"></div>
# Attribute order may vary; this handles both orderings.
# Each ordering is its own alternative with its own capture group for the
# numeric id, so on a match exactly one of group(1) / group(2) is set.
# Limitations visible in the pattern: the class attribute must be exactly
# class="richtext-evidence" (double quotes, no additional classes), and the
# div may contain nothing but whitespace.
_GW_RICHTEXT_RE = re.compile(
r'<div\b[^>]*\bclass="richtext-evidence"[^>]*\bdata-evidence-id="(\d+)"[^>]*>\s*</div>'
r'|'
r'<div\b[^>]*\bdata-evidence-id="(\d+)"[^>]*\bclass="richtext-evidence"[^>]*>\s*</div>'
)

# Rich-text finding fields that Ghostwriter allows inline evidence in.
# "title" is intentionally excluded — it is plain text, never richtext.
# report.extra_fields are also excluded; Ghostwriter does not support inline
# evidence there, so we leave those fields untouched.
# These are the keys looked up on each finding dict when resolving inline
# evidence placeholders.
_FINDING_TEXT_FIELDS = (
"affected_entities",
"description",
"impact",
"mitigation",
"recommendation",
"replication_steps",
"host_detection_techniques",
"network_detection_techniques",
"references",
)

_SEVERITY: dict[str, tuple[int, str]] = {
"critical": (1, "critical"),
"high": (2, "high"),
Expand All @@ -20,6 +51,105 @@
"info": (5, "info"),
}

def _build_evidence_index(report_json: dict) -> tuple[dict[str, dict], dict[int, dict]]:
    """Index every evidence object found in the report JSON.

    Returns a ``(by_name, by_id)`` pair:
        by_name: friendly_name -> evidence object, used for {{.name}} tags.
                 Only evidence with a non-empty string ``friendly_name``
                 is included.
        by_id:   evidence ``id`` -> evidence object, used for
                 richtext-evidence divs. Every collected evidence object
                 is included.

    When several evidence objects share a key, the one encountered last wins.
    """
    evidence = collect_evidence(report_json)

    # Every collected object is addressable by id, named or not.
    by_id: dict[int, dict] = {item["id"]: item for item in evidence}

    by_name: dict[str, dict] = {}
    for item in evidence:
        label = item.get("friendly_name")
        if isinstance(label, str) and label:
            by_name[label] = item

    return by_name, by_id


def _resolve_inline_evidence(text: str, ev_index: dict[str, dict]) -> tuple[str, set[int]]:
    """Expand Ghostwriter ``{{.…}}`` evidence tags found in an HTML field.

    The generateReport JSON ships rich-text fields verbatim (no Jinja2 pass),
    so the following tag forms may still be present and are handled here:

        {{.friendly_name}}          → <figure> with the evidence image
        {{.ref friendly_name}}      → escaped caption / friendly name text
        {{.caption friendly_name}}  → escaped caption / friendly name text

    ``ev_index`` maps friendly names to evidence objects. A ``ref`` or
    ``caption`` tag naming unknown evidence is replaced by the escaped name
    itself; a plain tag naming unknown evidence (or evidence without a
    ``path``) is left exactly as it was.

    Returns ``(resolved_text, ids)`` where ``ids`` holds the ``id`` of every
    evidence object that was embedded as an image.
    """
    if not (ev_index and text and "{{" in text):
        return text, set()

    embedded: set[int] = set()

    def _label(record: dict, fallback: str) -> str:
        # Caption wins, then friendly name, then whatever the tag said.
        return html.escape(record.get("caption") or record.get("friendly_name") or fallback)

    def _substitute(match: re.Match) -> str:
        body = match.group(1).strip()

        # Text-only forms: both keywords resolve to the same label text.
        for keyword in ("ref ", "caption "):
            if body.startswith(keyword):
                target = body[len(keyword):].strip()
                record = ev_index.get(target)
                if record:
                    return _label(record, target)
                return html.escape(target)

        # Plain {{.name}} → figure with image and caption.
        record = ev_index.get(body)
        if not record or not record.get("path"):
            return match.group(0)  # nothing to embed — keep the tag verbatim

        embedded.add(record["id"])
        caption = _label(record, body)
        path = html.escape(record["path"])
        pieces = (
            f'<figure>',
            f'<img src="{path}" alt="{caption}" style="max-width:100%">',
            f'<figcaption>{caption}</figcaption>',
            f'</figure>',
        )
        return "".join(pieces)

    resolved = _GW_TAG_RE.sub(_substitute, text)
    return resolved, embedded


def _resolve_richtext_evidence(text: str, ev_by_id: dict[int, dict]) -> tuple[str, set[int]]:
    """Expand Ghostwriter richtext-evidence placeholder divs into figures.

    Newer Ghostwriter TinyMCE versions mark inline evidence with an empty div:
        <div class="richtext-evidence" data-evidence-id="4"></div>
    rather than the older {{.friendly_name}} tag. Each such div whose id is
    present in ``ev_by_id`` (and whose evidence has a ``path``) is replaced by
    a <figure> containing the image and its caption; any other div is left
    untouched.

    Returns ``(resolved_text, ids)`` where ``ids`` holds the ``id`` of every
    evidence object that was embedded as an image.
    """
    if not (ev_by_id and text and 'richtext-evidence' in text):
        return text, set()

    embedded: set[int] = set()

    def _substitute(match: re.Match) -> str:
        # The id lands in group 1 or group 2 depending on attribute order.
        raw_id = match.group(1) or match.group(2)
        record = ev_by_id.get(int(raw_id))
        if not record or not record.get("path"):
            return match.group(0)  # nothing to embed — keep the div verbatim

        embedded.add(record["id"])
        caption = html.escape(record.get("caption") or record.get("friendly_name") or raw_id)
        path = html.escape(record["path"])
        pieces = (
            f'<figure>',
            f'<img src="{path}" alt="{caption}" style="max-width:100%">',
            f'<figcaption>{caption}</figcaption>',
            f'</figure>',
        )
        return "".join(pieces)

    resolved = _GW_RICHTEXT_RE.sub(_substitute, text)
    return resolved, embedded


class _TextExtractor(HTMLParser):
def __init__(self):
super().__init__()
Expand Down Expand Up @@ -61,6 +191,8 @@ def make_vue_data(raw: dict) -> dict:
normalised to None so templates can use a simple truthiness check to
conditionally render optional sections.
"""
ev_by_name, ev_by_id = _build_evidence_index(raw)

findings = []
for f in raw.get("findings") or []:
f = dict(f)
Expand All @@ -72,6 +204,22 @@ def make_vue_data(raw: dict) -> dict:
"score": float(f.get("cvss_score") or 0),
"vector": f.get("cvss_vector") or "n/a",
}
if ev_by_name or ev_by_id:
inline_ids: set[int] = set()
for field in _FINDING_TEXT_FIELDS:
raw_val = f.get(field)
if not isinstance(raw_val, str):
continue
resolved = raw_val
if "{{" in resolved:
resolved, used = _resolve_inline_evidence(resolved, ev_by_name)
inline_ids |= used
if "richtext-evidence" in resolved:
resolved, used = _resolve_richtext_evidence(resolved, ev_by_id)
inline_ids |= used
f[field] = resolved
if inline_ids and isinstance(f.get("evidence"), list):
f["evidence"] = [ev for ev in f["evidence"] if ev.get("id") not in inline_ids]
findings.append(f)

report = dict(raw)
Expand Down Expand Up @@ -107,5 +255,5 @@ def render_report(
bundle_js = BUNDLE.read_text("utf-8")
resources = build(template, report_json)

html = render_to_html(data, template_html, css, bundle_js, language, resources)
return render_to_pdf(html, resources)
rendered_html = render_to_html(data, template_html, css, bundle_js, language, resources)
return render_to_pdf(rendered_html, resources)
23 changes: 16 additions & 7 deletions app/reporting/evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,32 @@ def local_path(evidence_path: str) -> Path:
return _EVIDENCE_DIR / Path(evidence_path).relative_to("evidence")


def collect_evidence(obj: object) -> list[dict]:
    """Recursively find all evidence objects in the report JSON.

    An evidence object is any dict with a 'path' string starting with
    'evidence/' and an integer 'id'. Matching dicts are returned by
    reference (not copied), in depth-first document order; a matching dict
    is listed before any evidence objects nested inside it.

    Only dicts and lists are traversed; any other value (including ``None``
    or a bare string) yields no results.
    """
    found: list[dict] = []

    def _walk(node: object) -> None:
        # A single shared accumulator avoids building and merging a
        # temporary list at every nesting level.
        if isinstance(node, dict):
            path = node.get("path")
            eid = node.get("id")
            if (
                isinstance(path, str)
                and path.startswith("evidence/")
                and isinstance(eid, int)
                # bool is a subclass of int; True/False is never a real id.
                and not isinstance(eid, bool)
            ):
                found.append(node)
            for value in node.values():
                _walk(value)
        elif isinstance(node, list):
            for item in node:
                _walk(item)

    _walk(obj)
    return found


def collect_paths(obj: object) -> dict[str, int]:
    """Map each evidence path in the report JSON to its evidence id.

    Walks the report via ``collect_evidence`` and returns a mapping such as
    ``{"evidence/2/foo.png": 3}``. If the same path occurs more than once,
    the id of the last occurrence is kept.
    """
    mapping: dict[str, int] = {}
    for evidence in collect_evidence(obj):
        mapping[evidence["path"]] = evidence["id"]
    return mapping


def _fetch_and_save(client: GhostwriterClient, evidence_id: int, path: str, media_path: Path | None) -> tuple[str, bool]:
Expand Down
Loading