From bf274a2c87ac4a0ce68147f7240d376c19fcf81e Mon Sep 17 00:00:00 2001 From: Daniel Demmel Date: Sat, 27 Jun 2026 20:33:24 +0100 Subject: [PATCH 1/8] Escape raw HTML in assistant/tool content to fix XSS (#issue) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Transcript content containing raw HTML — e.g. a session where the user asks Claude to "write an E2E test that types into the field" — rendered those tags live into the HTML output. Opening the transcript fired the payloads (alert popups on every mention), and a malicious transcript could run arbitrary script. Root cause: the HTML renderer kept two markdown renderers — escape=True for user content (#119) and escape=False for assistant/tool/web content, on the assumption that the latter is trusted. It is not: the assistant routinely echoes arbitrary user/file/web input verbatim, and WebFetch results are raw web content. The Markdown output renderer already neutralises raw HTML from every source (_protect_html_tags); the HTML path was the lone, more dangerous, inconsistency. Fix: flip the shared render_markdown renderer to escape=True. This covers all affected paths at once (assistant prose, thinking, system messages, teammate bodies, Task/WebSearch/WebFetch results, plans). Markdown, code fences, Pygments highlighting and SHA links are unaffected; unsafe link/image schemes (javascript:, data:) were already neutralised by mistune. No snapshot churn — benign content renders identically. Tests (verified to go red on the pre-fix code): - test_markdown_rendering.py: shared renderer escapes raw HTML; assistant formatter end-to-end; javascript:/data: URL-scheme guard. - test_xss_browser.py: empirical Playwright test — opens the generated file in real Chromium and asserts no alert() dialog fires and no content-supplied nodes materialise in the DOM. Also documents the "all transcript content is untrusted" escaping contract for tool-renderer authors in dev-docs. --- claude_code_log/html/utils.py | 30 ++++- dev-docs/implementing-a-tool-renderer.md | 24 ++++ test/test_markdown_rendering.py | 76 +++++++++++ test/test_xss_browser.py | 163 +++++++++++++++++++++++ 4 files changed, 287 insertions(+), 6 deletions(-) create mode 100644 test/test_xss_browser.py diff --git a/claude_code_log/html/utils.py b/claude_code_log/html/utils.py index 632fa2c6..5d6ffb69 100644 --- a/claude_code_log/html/utils.py +++ b/claude_code_log/html/utils.py @@ -424,7 +424,24 @@ def block_code(code: str, info: Optional[str] = None) -> str: @functools.lru_cache(maxsize=1) def _get_markdown_renderer() -> mistune.Markdown: - """Get cached Mistune markdown renderer with Pygments syntax highlighting.""" + """Get cached Mistune markdown renderer with Pygments syntax highlighting. + + Uses ``escape=True`` so raw HTML embedded in the source text + (```` into the field" — + so rendering it unescaped lets that payload execute when the transcript + HTML is opened. The Markdown output path already neutralises raw HTML + from every source (see ``markdown/renderer.py::_protect_html_tags``); + the HTML path must match. ``escape=True`` does not affect Markdown + formatting, plugin output (Pygments, SHA links), or code fences — only + raw HTML tags in the body. + """ from ..markdown_plugins import make_codespan_sha_plugin, make_sha_plugin from ..git_remote import resolve_sha_for_current_render @@ -447,7 +464,7 @@ def _get_markdown_renderer() -> mistune.Markdown: # mistune's built-in rule consumes the backticks. make_codespan_sha_plugin(resolve_sha_for_current_render), ], - escape=False, # Don't escape HTML since we want to render markdown properly + escape=True, # Escape raw HTML: transcript content is untrusted (XSS) hard_wrap=True, # Line break for newlines (checklists in Assistant messages) ) @@ -686,10 +703,11 @@ def render_markdown_collapsible( For long content, creates a collapsible details element with a preview. For short content, renders inline with the specified CSS class. - Uses the ``escape=False`` renderer — for assistant/tool-authored content - (Task results, WebSearch/WebFetch, plans) that may emit pre-formed HTML. - For untrusted content (e.g. memory files), use - ``render_user_markdown_collapsible`` instead. + Renders via the shared HTML-escaping renderer (``render_markdown``), + so raw HTML in assistant/tool/web-authored content (Task results, + WebSearch/WebFetch, plans) is neutralised — transcript content is + untrusted (the assistant echoes arbitrary input). Markdown formatting, + Pygments highlighting and code fences are unaffected. Args: raw_content: The raw text content to render as markdown diff --git a/dev-docs/implementing-a-tool-renderer.md b/dev-docs/implementing-a-tool-renderer.md index 817b9f1c..9665181e 100644 --- a/dev-docs/implementing-a-tool-renderer.md +++ b/dev-docs/implementing-a-tool-renderer.md @@ -177,6 +177,30 @@ def format_websearch_output(output: WebSearchOutput) -> str: return render_markdown_collapsible(markdown_content, "websearch-results") ``` +### Escaping: all transcript content is untrusted + +Treat every value that comes out of a transcript as attacker-controlled. +The assistant routinely echoes arbitrary user/file/web input verbatim — a +prompt like *"write an E2E test that types `` into +the field"* lands that payload in assistant prose, a tool result, and a +Write tool's file content. If it reaches the HTML unescaped it executes when +the file is opened. There is no "trusted" source here. + +Two safe paths, depending on what you emit: + +- **Building HTML with f-strings/format** → run every interpolated value + through `escape_html()` first (as the input formatter above does with + `escape_query`). Never interpolate a raw field into markup. +- **Rendering markdown** → use `render_markdown` / `render_markdown_collapsible`. + Both use mistune with `escape=True`, so raw HTML tags in the body are + escaped to entities and unsafe link/image schemes (`javascript:`, `data:`) + are neutralised, while Markdown, code fences and Pygments still render. + +Regression coverage lives in `test/test_markdown_rendering.py` (unit) and +`test/test_xss_browser.py` (empirical: opens the file in a real browser and +asserts no `alert()` dialog fires). Add a payload-bearing case for any new +field you render. + ### Update Exports Add functions to `__all__`: diff --git a/test/test_markdown_rendering.py b/test/test_markdown_rendering.py index 597c3d65..9ad990e3 100644 --- a/test/test_markdown_rendering.py +++ b/test/test_markdown_rendering.py @@ -201,6 +201,82 @@ def test_render_user_markdown_escapes_html() -> None: assert "bold" in render_user_markdown("**bold**") +def test_render_markdown_escapes_html() -> None: + """The shared (assistant/tool/web-authored) renderer must escape raw HTML. + + Regression for the XSS where a transcript whose assistant text, tool + result, or fetched web content contained ``' + out = render_markdown(payload) + # No live markup reaches the DOM (escaped tags carry a dead + # ``onerror="`` but never a live ``onerror="`` on a real tag). + assert ")", + ): + out = render(src) + # The scheme is dangerous only as a live href/src attribute; + # appearing as visible link text is harmless. + assert 'href="javascript:' not in out, (render.__name__, src, out) + assert 'src="javascript:' not in out, (render.__name__, src, out) + assert 'href="data:' not in out, (render.__name__, src, out) + assert 'src="data:' not in out, (render.__name__, src, out) + + +def test_assistant_text_does_not_inject_live_html() -> None: + """End-to-end: assistant text with HTML must not emit live tags.""" + from claude_code_log.html.assistant_formatters import ( + format_assistant_text_content, + ) + from claude_code_log.models import ( + AssistantTextMessage, + MessageMeta, + TextContent, + ) + + meta = MessageMeta( + session_id="test-session", + timestamp="2024-01-01T00:00:00Z", + uuid="test-uuid", + ) + payload = "Sure! I'll enter into the field." + content = AssistantTextMessage(meta, items=[TextContent(type="text", text=payload)]) + out = format_assistant_text_content(content) + assert " into the +field"). See ``html/utils.py::_get_markdown_renderer`` for the escape policy. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest +from playwright.sync_api import Page + +from claude_code_log.converter import load_transcript +from claude_code_log.html.renderer import generate_html + +# Auto-firing vectors: " +) + +_BASE = { + "cwd": "/app", + "isSidechain": False, + "sessionId": "11110000-0000-4000-8000-000000000001", + "userType": "external", + "version": "1.0.0", +} +_USAGE = {"input_tokens": 1, "output_tokens": 1} + + +def _write_transcript(path: Path) -> None: + rows = [ + { + **_BASE, + "type": "user", + "uuid": "u1", + "parentUuid": None, + "timestamp": "2026-06-27T10:00:00.000Z", + "message": { + "role": "user", + "content": f"Write an E2E test that types this: {PAYLOAD}", + }, + }, + { + **_BASE, + "type": "assistant", + "uuid": "a1", + "parentUuid": "u1", + "requestId": "r1", + "timestamp": "2026-06-27T10:00:01.000Z", + "message": { + "role": "assistant", + "model": "claude", + "id": "m1", + "type": "message", + "stop_reason": "end_turn", + "stop_sequence": None, + "usage": _USAGE, + "content": [ + {"type": "text", "text": f"Sure! I'll enter {PAYLOAD} there."}, + { + "type": "tool_use", + "id": "t1", + "name": "Write", + "input": { + "file_path": "/app/t.spec.ts", + "content": f"await page.fill('#in', '{PAYLOAD}');", + }, + }, + ], + }, + }, + { + **_BASE, + "type": "user", + "uuid": "u2", + "parentUuid": "a1", + "timestamp": "2026-06-27T10:00:02.000Z", + "message": { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "t1", + "content": f"Wrote file containing {PAYLOAD}", + } + ], + }, + }, + ] + path.write_text("\n".join(json.dumps(r) for r in rows) + "\n", encoding="utf-8") + + +class TestTranscriptXss: + @pytest.mark.browser + def test_opening_transcript_fires_no_dialog( + self, page: Page, tmp_path: Path + ) -> None: + jsonl = tmp_path / "xss.jsonl" + _write_transcript(jsonl) + entries = load_transcript(jsonl, silent=True) + html_path = tmp_path / "xss.html" + html_path.write_text(generate_html(entries, "XSS"), encoding="utf-8") + + dialogs: list[str] = [] + + def _on_dialog(dialog: object) -> None: # pragma: no cover - event cb + dialogs.append(getattr(dialog, "message", "")) + dialog.dismiss() # type: ignore[attr-defined] + + page.on("dialog", _on_dialog) + page.goto(f"file://{html_path}") + # Give any onerror/script a tick to fire. + page.wait_for_timeout(200) + + assert dialogs == [], f"XSS executed — dialogs fired: {dialogs}" + + # And the payload is still visible to the reader, as escaped text. + body_text = page.inner_text("body") + assert "script-xss" in body_text + assert "alert('img-xss')" in body_text + + @pytest.mark.browser + def test_no_content_supplied_nodes_in_dom(self, page: Page, tmp_path: Path) -> None: + """The payload tags must not materialise as live DOM nodes.""" + jsonl = tmp_path / "xss.jsonl" + _write_transcript(jsonl) + entries = load_transcript(jsonl, silent=True) + html_path = tmp_path / "xss.html" + html_path.write_text(generate_html(entries, "XSS"), encoding="utf-8") + + page.goto(f"file://{html_path}") + + # No whose src is the payload's bogus "x" was injected, and no + # content-supplied leaked as a live element. + injected = page.evaluate( + "() => ({" + " imgs: document.querySelectorAll('img[src=\"x\"]').length," + " bolds: Array.from(document.querySelectorAll('b'))" + " .filter(b => b.textContent === 'bold').length," + "})" + ) + assert injected == {"imgs": 0, "bolds": 0}, injected From b3bda89f7342796f3d3f01c4b8df7f90f162980d Mon Sep 17 00:00:00 2001 From: Daniel Demmel Date: Sat, 27 Jun 2026 20:34:05 +0100 Subject: [PATCH 2/8] Configure mise to use the local .venv for uv Adds uv_venv_auto and pins the project venv path so mise/uv share the same .venv. --- mise.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mise.toml b/mise.toml index 7574397b..0852b277 100644 --- a/mise.toml +++ b/mise.toml @@ -1,2 +1,8 @@ [tools] python = "3.14" + +[settings] +python.uv_venv_auto = "source" + +[env] +_.python.venv = { path = ".venv" } \ No newline at end of file From ad03cb9fecb40bb3d7b187b52df8ab58a04222c2 Mon Sep 17 00:00:00 2001 From: Daniel Demmel Date: Sat, 27 Jun 2026 20:45:26 +0100 Subject: [PATCH 3/8] Address review: sync docstring, fix doc var name, use as_uri() - _markdown_collapsible docstring no longer describes the removed escape=False/escape=True split; both render fns now escape (XSS). - dev-docs: escape_query -> escaped_query to match the input formatter. - test_xss_browser: page.goto(html_path.as_uri()) instead of manually building a file:// URL (correct encoding, Windows-safe). --- claude_code_log/html/utils.py | 6 ++++-- dev-docs/implementing-a-tool-renderer.md | 2 +- test/test_xss_browser.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/claude_code_log/html/utils.py b/claude_code_log/html/utils.py index 5d6ffb69..9673fd94 100644 --- a/claude_code_log/html/utils.py +++ b/claude_code_log/html/utils.py @@ -669,8 +669,10 @@ def _markdown_collapsible( preview_line_count: int, ) -> str: """Shared body for the collapsible-markdown helpers, parameterized by the - markdown render function (escape=False for assistant/tool output vs - escape=True for untrusted content).""" + markdown render function. Both render functions escape raw HTML + (``escape=True``): transcript content is untrusted regardless of source — + assistant/tool output routinely echoes arbitrary user/file/web input — so + raw tags are neutralised rather than injected as live DOM (XSS).""" rendered_html = render_fn(raw_content) lines = raw_content.splitlines() diff --git a/dev-docs/implementing-a-tool-renderer.md b/dev-docs/implementing-a-tool-renderer.md index 9665181e..782ce95f 100644 --- a/dev-docs/implementing-a-tool-renderer.md +++ b/dev-docs/implementing-a-tool-renderer.md @@ -190,7 +190,7 @@ Two safe paths, depending on what you emit: - **Building HTML with f-strings/format** → run every interpolated value through `escape_html()` first (as the input formatter above does with - `escape_query`). Never interpolate a raw field into markup. + `escaped_query`). Never interpolate a raw field into markup. - **Rendering markdown** → use `render_markdown` / `render_markdown_collapsible`. Both use mistune with `escape=True`, so raw HTML tags in the body are escaped to entities and unsafe link/image schemes (`javascript:`, `data:`) diff --git a/test/test_xss_browser.py b/test/test_xss_browser.py index 3941a6fe..cdbc2c8c 100644 --- a/test/test_xss_browser.py +++ b/test/test_xss_browser.py @@ -129,7 +129,7 @@ def _on_dialog(dialog: object) -> None: # pragma: no cover - event cb dialog.dismiss() # type: ignore[attr-defined] page.on("dialog", _on_dialog) - page.goto(f"file://{html_path}") + page.goto(html_path.as_uri()) # Give any onerror/script a tick to fire. page.wait_for_timeout(200) @@ -149,7 +149,7 @@ def test_no_content_supplied_nodes_in_dom(self, page: Page, tmp_path: Path) -> N html_path = tmp_path / "xss.html" html_path.write_text(generate_html(entries, "XSS"), encoding="utf-8") - page.goto(f"file://{html_path}") + page.goto(html_path.as_uri()) # No whose src is the payload's bogus "x" was injected, and no # content-supplied leaked as a live element. From eacfb880ac056bb5b3e139411829cb6647f0ff6c Mon Sep 17 00:00:00 2001 From: Christian Boos Date: Sun, 28 Jun 2026 19:55:26 +0200 Subject: [PATCH 4/8] Escape transcript-derived fields on the message-title path (XSS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The body fix switched the shared Markdown renderer to escape=True, but the header title path is separate: the template renders `{{ message_title | safe }}` with no central escaping (titles legitimately emit structural HTML, e.g. the spawn-collapsed marker span), so each title_* method must escape its own interpolated field. Four did not — all transcript-derived and attacker-influenced: - generic / mcp__* / custom tool name (title_ToolUseMessage fallback) — the most severe: a tool_use with name `` rendered live in the header. Specialized tools were already escaped via _tool_title. - hook name (title_HookAttachmentMessage) - workflow phase title (title_WorkflowPhaseMessage) - workflow agent label (title_WorkflowAgentMessage) These four live on the shared base Renderer (also used by the Markdown renderer, which must not receive HTML-entity-escaped titles), so the escaping is added as HtmlRenderer overrides — mirroring how _tool_title already escapes the tool name for specialized tools. Also fix the now-stale `_get_user_markdown_renderer` docstring that still claimed assistant content uses escape=False (the shared renderer escapes now). Tests: test_xss_titles.py pins all four sinks (RED before, GREEN after); test_xss_browser.py gains a title-path payload (a generic tool_use whose name is the payload) — without the fix it fires script-xss/img-xss dialogs. Co-Authored-By: Claude Opus 4.8 (1M context) --- claude_code_log/html/renderer.py | 34 +++++++- claude_code_log/html/utils.py | 12 +-- test/test_xss_browser.py | 34 ++++++++ test/test_xss_titles.py | 131 +++++++++++++++++++++++++++++++ 4 files changed, 205 insertions(+), 6 deletions(-) create mode 100644 test/test_xss_titles.py diff --git a/claude_code_log/html/renderer.py b/claude_code_log/html/renderer.py index c34a4bca..68e1ca86 100644 --- a/claude_code_log/html/renderer.py +++ b/claude_code_log/html/renderer.py @@ -943,7 +943,15 @@ def title_ToolUseMessage( content.input, AskUserQuestionInput ) and self._paired_answer_supersedes(message): return "" - return super().title_ToolUseMessage(content, message) + # Specialized tools dispatch to a title_*Input method that escapes via + # ``_tool_title``. Tools with NO specialized method (generic / mcp__* / + # ToolSearch / custom) fall back to the raw tool name — which is + # attacker-controllable and lands live in the header span. Escape it + # here rather than in the shared base ``title_ToolUseMessage`` (the + # Markdown renderer must not get HTML-entity-escaped titles). #245 XSS. + if title := self._dispatch_title(content.input, message): + return title + return escape_html(content.tool_name) def title_ToolResultMessage( self, content: ToolResultMessage, message: TemplateMessage @@ -964,6 +972,30 @@ def title_ToolResultMessage( return f"{base} {marker}" if base else marker return base + # Title overrides that escape their transcript-derived field for the HTML + # header span. These titles are built on the shared base ``Renderer`` (also + # used by the Markdown renderer, which must NOT receive HTML-entity-escaped + # titles), so the escaping lives on the HTML path only — mirroring how + # ``_tool_title`` escapes the tool name for specialized tools. #245 XSS. + + def title_HookAttachmentMessage( + self, content: HookAttachmentMessage, _: TemplateMessage + ) -> str: + # ``hook_name`` (e.g. "PostToolUse:TaskUpdate") is transcript-derived + # and lands in the header; escape it. + label = content.hook_name or content.hook_event or content.kind + return f"Hook · {escape_html(label)}" + + def title_WorkflowPhaseMessage( + self, content: WorkflowPhaseMessage, _: TemplateMessage + ) -> str: + return f"Phase: {escape_html(content.title)}" if content.title else "Phase" + + def title_WorkflowAgentMessage( + self, content: WorkflowAgentMessage, _: TemplateMessage + ) -> str: + return f"Agent {escape_html(content.label)}" if content.label else "Agent" + def title_TaskInput(self, input: TaskInput, message: TemplateMessage) -> str: """Title → '🔧 Task (subagent_type) [async #]'. diff --git a/claude_code_log/html/utils.py b/claude_code_log/html/utils.py index 9673fd94..1575c75c 100644 --- a/claude_code_log/html/utils.py +++ b/claude_code_log/html/utils.py @@ -507,11 +507,13 @@ def render_markdown_inline(text: str) -> str: def _get_user_markdown_renderer() -> mistune.Markdown: """Markdown renderer for user-authored text. - Differs from the shared renderer in one critical way: ``escape=True`` - so a user typing raw `` into the field"). See ``html/utils.py::_get_markdown_renderer`` for the escape policy. + +It is ALSO placed in a TITLE field — a generic tool_use's ``name`` (#245): +the header title path (``{{ message_title | safe }}``) has no central +escaping, so a tool with no specialized title method renders its raw name +live in the header span. That channel is blind to the body-render escape and +needs the per-field ``escape_html`` on the HTML title methods. """ from __future__ import annotations @@ -107,6 +113,34 @@ def _write_transcript(path: Path) -> None: ], }, }, + { + # TITLE-path vector (#245): a tool with no specialized title method + # falls back to its raw name in the header span. The name IS the + # payload — auto-firing if the title isn't escaped. + **_BASE, + "type": "assistant", + "uuid": "a2", + "parentUuid": "u2", + "requestId": "r2", + "timestamp": "2026-06-27T10:00:03.000Z", + "message": { + "role": "assistant", + "model": "claude", + "id": "m2", + "type": "message", + "stop_reason": "end_turn", + "stop_sequence": None, + "usage": _USAGE, + "content": [ + { + "type": "tool_use", + "id": "t2", + "name": PAYLOAD, + "input": {"q": "noop"}, + } + ], + }, + }, ] path.write_text("\n".join(json.dumps(r) for r in rows) + "\n", encoding="utf-8") diff --git a/test/test_xss_titles.py b/test/test_xss_titles.py new file mode 100644 index 00000000..a8eb4f81 --- /dev/null +++ b/test/test_xss_titles.py @@ -0,0 +1,131 @@ +"""XSS: the message-TITLE path must escape transcript-derived fields (#245). + +The header renders ``{{ message_title | safe }}`` — titles legitimately carry +structural HTML (e.g. the spawn-collapsed ```` marker), so there is no +central escaping. Each ``title_*`` method that interpolates a transcript field +must therefore escape that field on the HTML path. daaain's PR secured the +message *body* (the shared Markdown renderer → ``escape=True``) but left four +title sinks unescaped; these pin them. + +The four sinks (all on the shared base ``Renderer``; the HTML renderer escapes +on its side only, so the Markdown renderer doesn't get HTML-entity-escaped +titles): + +1. generic / mcp__* / custom tool name — ``title_ToolUseMessage`` fallback +2. hook name — ``title_HookAttachmentMessage`` +3. workflow phase title — ``title_WorkflowPhaseMessage`` +4. workflow agent label — ``title_WorkflowAgentMessage`` +""" + +from __future__ import annotations + +import json +from pathlib import Path + +from claude_code_log.converter import load_transcript +from claude_code_log.html.renderer import HtmlRenderer, generate_html +from claude_code_log.models import ( + BashInput, + HookAttachmentMessage, + MessageMeta, + ToolUseMessage, + WorkflowAgentMessage, + WorkflowPhaseMessage, +) +from claude_code_log.renderer import TemplateMessage + +PAYLOAD = "" +ESCAPED = "<img src=x onerror=alert(1)>" + + +def _meta() -> MessageMeta: + return MessageMeta(uuid="u", session_id="s", timestamp="2025-01-01T00:00:00Z") + + +def _title(content) -> str: + msg = TemplateMessage(content) + return HtmlRenderer().title_content(msg) + + +class TestTitlePathEscaping: + def test_generic_tool_name_escaped(self, tmp_path: Path): + # A tool with no specialized title method (generic / mcp__* / custom) + # falls back to its raw name in the header. Exercise the real render + # path with the payload AS the tool name. + rows = [ + { + "type": "user", + "uuid": "u0", + "parentUuid": None, + "isSidechain": False, + "userType": "external", + "cwd": "/x", + "sessionId": "s1", + "version": "1.0", + "timestamp": "2025-01-01T00:00:00Z", + "message": {"role": "user", "content": "go"}, + }, + { + "type": "assistant", + "uuid": "a1", + "parentUuid": "u0", + "isSidechain": False, + "userType": "external", + "cwd": "/x", + "sessionId": "s1", + "version": "1.0", + "timestamp": "2025-01-01T00:00:01Z", + "requestId": "r1", + "message": { + "id": "m1", + "type": "message", + "role": "assistant", + "model": "claude", + "stop_reason": "tool_use", + "stop_sequence": None, + "usage": {"input_tokens": 1, "output_tokens": 1}, + "content": [ + {"type": "tool_use", "id": "t1", "name": PAYLOAD, "input": {}} + ], + }, + }, + ] + f = tmp_path / "x.jsonl" + f.write_text("\n".join(json.dumps(r) for r in rows), encoding="utf-8") + html = generate_html(load_transcript(f, silent=True), "x") + assert PAYLOAD not in html + assert ESCAPED in html + + def test_hook_name_escaped(self): + content = HookAttachmentMessage(meta=_meta(), kind="success", hook_name=PAYLOAD) + out = _title(content) + assert out.startswith("Hook · ") + assert PAYLOAD not in out + assert ESCAPED in out + + def test_workflow_phase_title_escaped(self): + content = WorkflowPhaseMessage(meta=_meta(), title=PAYLOAD) + out = _title(content) + assert out.startswith("Phase: ") + assert PAYLOAD not in out + assert ESCAPED in out + + def test_workflow_agent_label_escaped(self): + content = WorkflowAgentMessage(meta=_meta(), label=PAYLOAD) + out = _title(content) + assert out.startswith("Agent ") + assert PAYLOAD not in out + assert ESCAPED in out + + def test_specialized_tool_name_still_escaped_and_unbroken(self): + # Regression guard: the specialized path (BashInput → _tool_title) + # already escapes; a benign tool name renders normally. + content = ToolUseMessage( + meta=_meta(), + input=BashInput(command="ls", description="list"), + tool_use_id="t1", + tool_name="Bash", + ) + out = _title(content) + assert "Bash" in out + assert "<" not in out # nothing to escape, not mangled From c349e30568fc0f7b0217aa30f343db6a85332728 Mon Sep 17 00:00:00 2001 From: Christian Boos Date: Sun, 28 Jun 2026 20:14:19 +0200 Subject: [PATCH 5/8] Escape the System-level title sink + neutralise titles in Markdown output (XSS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 of the title-path XSS hardening: - 5th HTML sink: title_SystemMessage rendered `System {level.title()}` raw. `level` is FREE-TEXT from the transcript (factory: `level or "info"`), not an enum, so a system entry with level `` rendered live in the header. Add an HtmlRenderer override that escapes — title-casing the raw level FIRST, then escaping (escaping first lets `.title()` capitalize the entity prefixes, `<` → `≪`, breaking them). - Markdown output: the format-neutral heading site emitted `# {title}` raw, so the same title fields (generic tool name, hook/phase/agent label, system level) reached the .md unescaped — making the PR's "Markdown neutralises raw HTML from every source" guarantee false. Neutralise the title there with the markdown-appropriate `_protect_html_tags` (not escape_html). Gate on a literal `<`: the mistune round-trip re-normalises markdown escaping (`\*\*` → `\**`), so titles with no tag must pass through untouched — avoids corrupting legit markdown titles while still closing the tag-injection path. Tests: test_xss_titles.py gains the System-level HTML case and a Markdown-path case (both RED before, GREEN after); the specialized-tool and no-churn snapshot guards confirm no collateral damage. Co-Authored-By: Claude Opus 4.8 (1M context) --- claude_code_log/html/renderer.py | 9 ++++ claude_code_log/markdown/renderer.py | 14 +++++- test/test_xss_titles.py | 71 ++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/claude_code_log/html/renderer.py b/claude_code_log/html/renderer.py index 68e1ca86..6e7df37f 100644 --- a/claude_code_log/html/renderer.py +++ b/claude_code_log/html/renderer.py @@ -996,6 +996,15 @@ def title_WorkflowAgentMessage( ) -> str: return f"Agent {escape_html(content.label)}" if content.label else "Agent" + def title_SystemMessage(self, content: SystemMessage, _: TemplateMessage) -> str: + # ``level`` is FREE-TEXT from the transcript (``system_factory``: + # ``transcript.level or "info"``), not an enum — so it can carry a + # payload that lands in the header. Title-case the RAW level FIRST, + # then escape: escaping first would let ``.title()`` capitalize the + # entity prefixes (``<`` → ``≪``) and break the escaping. #245 XSS. + level = content.level or "unknown" + return f"System {escape_html(level.title())}" + def title_TaskInput(self, input: TaskInput, message: TemplateMessage) -> str: """Title → '🔧 Task (subagent_type) [async #]'. diff --git a/claude_code_log/markdown/renderer.py b/claude_code_log/markdown/renderer.py index c445bac6..59294a62 100644 --- a/claude_code_log/markdown/renderer.py +++ b/claude_code_log/markdown/renderer.py @@ -2049,7 +2049,19 @@ def _render_message(self, msg: TemplateMessage, level: int) -> str: if not suppress_heading: heading_level = min(level, 6) # Markdown max is h6 - parts.append(f"{'#' * heading_level} {title}") + # Neutralise raw HTML in the title (#245 XSS): transcript- + # derived title fields (generic tool name, hook name, workflow + # phase/agent label, system level) reach this format-neutral + # heading site unescaped. Use the markdown-appropriate + # ``_protect_html_tags`` (entity-escapes raw tags, preserves + # markdown) — not ``escape_html``, which would mangle markdown. + # Gate on a literal ``<`` (the only char that can open a tag): + # the mistune round-trip in ``_protect_html_tags`` re-normalises + # markdown escaping (e.g. ``\*\*`` → ``\**``), so a title with + # no tag must pass through untouched. Covers all the title + # sinks symmetrically with the HtmlRenderer escaping. + safe_title = _protect_html_tags(title) if "<" in title else title + parts.append(f"{'#' * heading_level} {safe_title}") # Per-message timestamp line (issue #160). Skip for # session headers (they have no meaningful per-msg time) # and when the heading was suppressed by `compact` mode diff --git a/test/test_xss_titles.py b/test/test_xss_titles.py index a8eb4f81..589a4582 100644 --- a/test/test_xss_titles.py +++ b/test/test_xss_titles.py @@ -24,10 +24,12 @@ from claude_code_log.converter import load_transcript from claude_code_log.html.renderer import HtmlRenderer, generate_html +from claude_code_log.markdown.renderer import MarkdownRenderer from claude_code_log.models import ( BashInput, HookAttachmentMessage, MessageMeta, + SystemMessage, ToolUseMessage, WorkflowAgentMessage, WorkflowPhaseMessage, @@ -117,6 +119,19 @@ def test_workflow_agent_label_escaped(self): assert PAYLOAD not in out assert ESCAPED in out + def test_system_level_escaped(self): + # ``level`` is free-text from the transcript, not an enum. The title is + # ``System {level.title()}``; the title-casing folds the tag name's + # case (````) but the tag would still fire (HTML attrs are + # case-insensitive), so it must be escaped AFTER ``.title()``. + content = SystemMessage(meta=_meta(), level=PAYLOAD, text="x") + out = _title(content) + assert out.startswith("System ") + # No live tag at any case; the dangerous ``<`` is entity-escaped, and + # ``.title()`` didn't corrupt the entity (no ``≪``). + assert "`` for a + downstream viewer to execute (#245).""" + + def test_generic_tool_name_protected_in_markdown_heading(self, tmp_path: Path): + rows = [ + { + "type": "user", + "uuid": "u0", + "parentUuid": None, + "isSidechain": False, + "userType": "external", + "cwd": "/x", + "sessionId": "s1", + "version": "1.0", + "timestamp": "2025-01-01T00:00:00Z", + "message": {"role": "user", "content": "go"}, + }, + { + "type": "assistant", + "uuid": "a1", + "parentUuid": "u0", + "isSidechain": False, + "userType": "external", + "cwd": "/x", + "sessionId": "s1", + "version": "1.0", + "timestamp": "2025-01-01T00:00:01Z", + "requestId": "r1", + "message": { + "id": "m1", + "type": "message", + "role": "assistant", + "model": "claude", + "stop_reason": "tool_use", + "stop_sequence": None, + "usage": {"input_tokens": 1, "output_tokens": 1}, + "content": [ + {"type": "tool_use", "id": "t1", "name": PAYLOAD, "input": {}} + ], + }, + }, + ] + f = tmp_path / "x.jsonl" + f.write_text("\n".join(json.dumps(r) for r in rows), encoding="utf-8") + md = MarkdownRenderer().generate(load_transcript(f, silent=True), "x") + # The raw tag must not survive into the heading… + assert f"# {PAYLOAD}" not in md + assert " Date: Mon, 29 Jun 2026 08:06:42 +0200 Subject: [PATCH 6/8] Route every Markdown heading through one HTML-neutralising gate (XSS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 3 — close the Markdown "neutralises raw HTML from every source" guarantee honestly. The per-message heading was gated last round, but the page / project-index / session page headings still emitted transcript- reachable text raw: a crafted cwd flows through get_project_display_name (Path(cwd).name) into the `## {display_name}` index heading, and page titles derive from session summaries. Factor a single `_safe_md_heading(text)` helper (the `<`-gated `_protect_html_tags`) and route EVERY `#`/`##` heading through it — the per-message site plus the four page/project headings — so neutralisation is one auditable structural gate rather than a per-site convention that drifts when a new heading site is added. For the project link form, only the display_name fragment is neutralised so the link target is preserved. The HTML path is unchanged (its per-field HtmlRenderer title overrides are a separate, correct escaping path; the page /<h1> autoescape in the template). The `<`-gate keeps tag-free headings byte-identical — no snapshot churn. Test: a crafted-cwd → display_name → index heading regression (exercises the real path, not a synthetic title arg), plus the existing per-message and tool_name markdown cases. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- claude_code_log/markdown/renderer.py | 54 ++++++++++++++++++---------- test/test_xss_titles.py | 22 ++++++++++++ 2 files changed, 58 insertions(+), 18 deletions(-) diff --git a/claude_code_log/markdown/renderer.py b/claude_code_log/markdown/renderer.py index 59294a62..edf5e624 100644 --- a/claude_code_log/markdown/renderer.py +++ b/claude_code_log/markdown/renderer.py @@ -273,6 +273,30 @@ def _protect_html_tags(text: str) -> str: return str(rendered).rstrip("\n") +def _safe_md_heading(text: str) -> str: + """Neutralise raw HTML in a Markdown heading's text (#245 XSS). + + The single structural gate for EVERY ``#``/``##`` heading the Markdown + renderer emits — per-message titles AND the page/project/session page + headings. Transcript-reachable heading sources (generic tool names, hook / + workflow phase / agent labels, system ``level``, session summaries, and + project display names derived from ``cwd``) could otherwise carry a raw + ``<img onerror=…>`` into a ``.md`` heading for a downstream viewer to + execute. Routing every heading through one helper makes "neutralise raw + HTML at every heading" a structural property — one place to audit, can't be + forgotten when a new heading site is added — rather than a per-site + convention that drifts (the failure mode that produced this whole class). + + Gated on a literal ``<`` (the only char that can open a tag): the mistune + round-trip in ``_protect_html_tags`` re-normalises markdown escaping + (``\\*\\*`` → ``\\**``), so a heading with no tag must pass through + byte-identical (no churn, no collateral mangling). Markdown-appropriate + (entity-escape the tag, preserve markdown) — distinct from the HTML path, + which escapes per-field via the ``HtmlRenderer`` title overrides. + """ + return _protect_html_tags(text) if "<" in text else text + + def _render_expand_paths_tree(template_projects: list[Any]) -> list[str]: """Render `--expand-paths` Markdown index as a nested bullet-list directory tree. @@ -2049,19 +2073,9 @@ def _render_message(self, msg: TemplateMessage, level: int) -> str: if not suppress_heading: heading_level = min(level, 6) # Markdown max is h6 - # Neutralise raw HTML in the title (#245 XSS): transcript- - # derived title fields (generic tool name, hook name, workflow - # phase/agent label, system level) reach this format-neutral - # heading site unescaped. Use the markdown-appropriate - # ``_protect_html_tags`` (entity-escapes raw tags, preserves - # markdown) — not ``escape_html``, which would mangle markdown. - # Gate on a literal ``<`` (the only char that can open a tag): - # the mistune round-trip in ``_protect_html_tags`` re-normalises - # markdown escaping (e.g. ``\*\*`` → ``\**``), so a title with - # no tag must pass through untouched. Covers all the title - # sinks symmetrically with the HtmlRenderer escaping. - safe_title = _protect_html_tags(title) if "<" in title else title - parts.append(f"{'#' * heading_level} {safe_title}") + # Neutralise raw HTML in the title via the single heading gate + # (#245 XSS) — see ``_safe_md_heading``. + parts.append(f"{'#' * heading_level} {_safe_md_heading(title)}") # Per-message timestamp line (issue #160). Skip for # session headers (they have no meaningful per-msg time) # and when the heading was suppressed by `compact` mode @@ -2154,7 +2168,7 @@ def _generate_inner( } parts = [f"<!-- Generated by claude-code-log v{get_library_version()} -->", ""] - parts.append(f"# {title}") + parts.append(f"# {_safe_md_heading(title)}") # Table of Contents if session_nav: @@ -2237,7 +2251,7 @@ def generate_projects_index( template_projects, template_summary = prepare_projects_index(project_summaries) parts = [f"<!-- Generated by claude-code-log v{get_library_version()} -->", ""] - parts.append(f"# {title}") + parts.append(f"# {_safe_md_heading(title)}") # Summary stats parts.append( @@ -2264,11 +2278,15 @@ def generate_projects_index( # `--combined no` mode: header is a plain heading (no # link to the non-existent combined file); per-session # bullets link directly to `session-{id}.md` files. - parts.append(f"## {project.display_name}") + parts.append(f"## {_safe_md_heading(project.display_name)}") else: - # Derive markdown link from html_file path + # Derive markdown link from html_file path. Neutralise only the + # display_name fragment (transcript-reachable via cwd) so the + # link target is preserved. md_link = project.html_file.replace(".html", ".md") - parts.append(f"## [{project.display_name}]({md_link})") + parts.append( + f"## [{_safe_md_heading(project.display_name)}]({md_link})" + ) # Use actual session count (filtered) like HTML does session_count = ( len(project.sessions) if project.sessions else project.jsonl_count diff --git a/test/test_xss_titles.py b/test/test_xss_titles.py index 589a4582..99dabfb0 100644 --- a/test/test_xss_titles.py +++ b/test/test_xss_titles.py @@ -200,3 +200,25 @@ def test_generic_tool_name_protected_in_markdown_heading(self, tmp_path: Path): assert "<img" not in md.lower() # …it's entity-escaped instead. assert "<img" in md.lower() + + def test_project_display_name_protected_in_index_heading(self): + # Real path (not a synthetic title arg): a project's display_name is + # derived from its cwd — get_project_display_name returns Path(cwd).name + # — so a crafted cwd lands the payload in the `##` project heading of + # the Markdown index. Exercise that end to end. + project_summaries = [ + { + "name": "-home-u-proj", + "html_file": "proj/combined_transcripts.html", + "jsonl_count": 1, + "message_count": 1, + "last_modified": 1700000000.0, + "working_directories": [f"/home/u/{PAYLOAD}"], + } + ] + md = MarkdownRenderer().generate_projects_index(project_summaries) + # The crafted cwd basename IS the payload; the heading must neutralise + # it, not emit a raw tag. + assert "## [<img" not in md.lower() + assert "## <img" not in md.lower() + assert "<img" in md.lower() From 27e1d6edc03cbb2cccafcac64fb1c14c2687aedb Mon Sep 17 00:00:00 2001 From: Christian Boos <christian.boos@bct-technology.com> Date: Mon, 29 Jun 2026 19:00:29 +0200 Subject: [PATCH 7/8] Generalize the Markdown XSS gate to safe_markdown_inline (#245) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The heading gate closed the `#`/`##` heading sinks, but the Markdown output has a broader class of inline link-label/list surfaces that interpolate transcript-derived text raw — so "neutralise raw HTML from every source" wasn't yet true. Rename `_safe_md_heading` → `safe_markdown_inline` (now module-public) and route EVERY Markdown interpolation surface through it: the inline surfaces (expand-paths tree label, WebSearch result link title, TOC label, index session-link label) plus the headings it already covered. In `[label](url)` forms only the label fragment is neutralised — the destination is preserved. One auditable structural gate instead of a per-site convention that drifts; the `<`-guard keeps tag-free text byte-identical (no snapshot churn). Tests: a `safe_markdown_inline` gate unit + per-surface end-to-end regressions (WebSearch link, project index heading + session link, expand-paths label). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- claude_code_log/markdown/renderer.py | 59 ++++++------ test/test_xss_markdown_surfaces.py | 128 +++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 25 deletions(-) create mode 100644 test/test_xss_markdown_surfaces.py diff --git a/claude_code_log/markdown/renderer.py b/claude_code_log/markdown/renderer.py index edf5e624..3038fa89 100644 --- a/claude_code_log/markdown/renderer.py +++ b/claude_code_log/markdown/renderer.py @@ -273,26 +273,33 @@ def _protect_html_tags(text: str) -> str: return str(rendered).rstrip("\n") -def _safe_md_heading(text: str) -> str: - """Neutralise raw HTML in a Markdown heading's text (#245 XSS). - - The single structural gate for EVERY ``#``/``##`` heading the Markdown - renderer emits — per-message titles AND the page/project/session page - headings. Transcript-reachable heading sources (generic tool names, hook / - workflow phase / agent labels, system ``level``, session summaries, and - project display names derived from ``cwd``) could otherwise carry a raw - ``<img onerror=…>`` into a ``.md`` heading for a downstream viewer to - execute. Routing every heading through one helper makes "neutralise raw - HTML at every heading" a structural property — one place to audit, can't be - forgotten when a new heading site is added — rather than a per-site - convention that drifts (the failure mode that produced this whole class). +def safe_markdown_inline(text: str) -> str: + """Neutralise raw HTML in a Markdown inline-text fragment (#245 XSS). + + The single structural gate for EVERY markdown surface that interpolates + transcript-derived text into a position a downstream viewer would render as + markup — ``#``/``##`` headings (per-message titles + the page/project/ + session page headings) AND inline link labels / list items (the TOC label, + WebSearch result link titles, the project- and session-index link labels, + the expand-paths tree labels). Transcript-reachable sources (generic tool + names, hook / workflow phase / agent labels, system ``level``, session + summaries, project display names derived from ``cwd``, web result titles) + could otherwise carry a raw ``<img onerror=…>`` into the ``.md`` for a + permissive viewer to execute. Routing every such surface through ONE helper + makes "neutralise raw HTML from every source" a structural property — one + place to audit, can't be forgotten when a new surface is added — rather + than a per-site convention that drifts (the failure mode that produced this + whole class across several review rounds). + + Pass only the text FRAGMENT (the label/title/heading text), not a composed + ``[label](url)`` — the destination is preserved by the caller. Gated on a literal ``<`` (the only char that can open a tag): the mistune round-trip in ``_protect_html_tags`` re-normalises markdown escaping - (``\\*\\*`` → ``\\**``), so a heading with no tag must pass through + (``\\*\\*`` → ``\\**``), so a fragment with no tag must pass through byte-identical (no churn, no collateral mangling). Markdown-appropriate (entity-escape the tag, preserve markdown) — distinct from the HTML path, - which escapes per-field via the ``HtmlRenderer`` title overrides. + which escapes per-field via ``escape_html`` in the ``HtmlRenderer``. """ return _protect_html_tags(text) if "<" in text else text @@ -371,7 +378,7 @@ def _emit(node: dict[str, Any], depth: int) -> None: _emit(node[name], depth + 1) for label, url, ts in node.get("_links", []): ts_suffix = f" — *{ts}*" if ts else "" - lines.append(f"{indent}- [{label}]({url}){ts_suffix}") + lines.append(f"{indent}- [{safe_markdown_inline(label)}]({url}){ts_suffix}") _emit(root, 0) return lines @@ -1549,7 +1556,7 @@ def format_WebSearchOutput( parts.append("---") parts.append("") for link in output.links: - parts.append(f"- [{link.title}]({link.url})") + parts.append(f"- [{safe_markdown_inline(link.title)}]({link.url})") elif not output.summary: # Only show "no results" if there's also no summary parts.append("*No results found*") @@ -1995,7 +2002,7 @@ def _generate_toc(self, session_nav: list[dict[str, Any]]) -> str: if summary else f"Session `{session_short}`" ) - lines.append(f"- [{label}](#{anchor})") + lines.append(f"- [{safe_markdown_inline(label)}](#{anchor})") lines.append("") return "\n".join(lines) @@ -2074,8 +2081,8 @@ def _render_message(self, msg: TemplateMessage, level: int) -> str: if not suppress_heading: heading_level = min(level, 6) # Markdown max is h6 # Neutralise raw HTML in the title via the single heading gate - # (#245 XSS) — see ``_safe_md_heading``. - parts.append(f"{'#' * heading_level} {_safe_md_heading(title)}") + # (#245 XSS) — see ``safe_markdown_inline``. + parts.append(f"{'#' * heading_level} {safe_markdown_inline(title)}") # Per-message timestamp line (issue #160). Skip for # session headers (they have no meaningful per-msg time) # and when the heading was suppressed by `compact` mode @@ -2168,7 +2175,7 @@ def _generate_inner( } parts = [f"<!-- Generated by claude-code-log v{get_library_version()} -->", ""] - parts.append(f"# {_safe_md_heading(title)}") + parts.append(f"# {safe_markdown_inline(title)}") # Table of Contents if session_nav: @@ -2251,7 +2258,7 @@ def generate_projects_index( template_projects, template_summary = prepare_projects_index(project_summaries) parts = [f"<!-- Generated by claude-code-log v{get_library_version()} -->", ""] - parts.append(f"# {_safe_md_heading(title)}") + parts.append(f"# {safe_markdown_inline(title)}") # Summary stats parts.append( @@ -2278,14 +2285,14 @@ def generate_projects_index( # `--combined no` mode: header is a plain heading (no # link to the non-existent combined file); per-session # bullets link directly to `session-{id}.md` files. - parts.append(f"## {_safe_md_heading(project.display_name)}") + parts.append(f"## {safe_markdown_inline(project.display_name)}") else: # Derive markdown link from html_file path. Neutralise only the # display_name fragment (transcript-reachable via cwd) so the # link target is preserved. md_link = project.html_file.replace(".html", ".md") parts.append( - f"## [{_safe_md_heading(project.display_name)}]({md_link})" + f"## [{safe_markdown_inline(project.display_name)}]({md_link})" ) # Use actual session count (filtered) like HTML does session_count = ( @@ -2314,7 +2321,9 @@ def generate_projects_index( timestamp_suffix = ( f" — *{timestamp_range}*" if timestamp_range else "" ) - parts.append(f"- [{label}]({file_link}){timestamp_suffix}") + parts.append( + f"- [{safe_markdown_inline(label)}]({file_link}){timestamp_suffix}" + ) parts.append("") return "\n".join(parts) diff --git a/test/test_xss_markdown_surfaces.py b/test/test_xss_markdown_surfaces.py new file mode 100644 index 00000000..7f124b0e --- /dev/null +++ b/test/test_xss_markdown_surfaces.py @@ -0,0 +1,128 @@ +"""XSS: every Markdown interpolation surface routes through one gate (#245). + +Round-4 follow-up: the per-message + page/project/session headings were gated +last; this generalises the gate to `safe_markdown_inline` and routes the inline +link-label / list surfaces through it too — so "neutralise raw HTML from every +source" is a single structural property, not a per-site convention. + +Surfaces (markdown/renderer.py), each driven end-to-end here: +- WebSearch result link title → format_WebSearchOutput +- projects-index project heading + link → generate_projects_index +- per-project session-link label → generate_projects_index (combined off) +- expand-paths tree label → generate_projects_index (expand_paths) +- (TOC label + headings are pinned in test_xss_titles.py) +""" + +from __future__ import annotations + +from typing import Any + +from claude_code_log.markdown.renderer import MarkdownRenderer, safe_markdown_inline +from claude_code_log.models import ( + MessageMeta, + SystemMessage, + WebSearchLink, + WebSearchOutput, +) +from claude_code_log.renderer import TemplateMessage + +PAYLOAD = "<img src=x onerror=alert(1)>" + + +def _no_raw_tag(md: str) -> None: + assert "<img" not in md.lower(), md + assert "<img" in md.lower(), md + + +# ----------------------------- the gate itself ------------------------------- + + +class TestSafeMdInlineGate: + def test_tag_is_entity_escaped(self): + assert safe_markdown_inline(PAYLOAD) == "<img src=x onerror=alert(1)>" + + def test_plain_text_passes_through_byte_identical(self): + # No `<` → no mistune round-trip → markdown escaping not re-normalised. + for s in ("just text", "a **bold** label", r"escaped \*\*stars\*\*", ""): + assert safe_markdown_inline(s) == s + + +# ----------------------------- inline surfaces ------------------------------- + + +class TestMarkdownInlineSurfaces: + def test_websearch_link_title(self): + out = WebSearchOutput( + query="q", + links=[WebSearchLink(title=PAYLOAD, url="https://example.com")], + ) + # format_WebSearchOutput ignores the message arg; any real + # TemplateMessage satisfies the signature. + msg = TemplateMessage( + SystemMessage( + meta=MessageMeta( + uuid="u", session_id="s", timestamp="2025-01-01T00:00:00Z" + ), + level="info", + text="x", + ) + ) + md = MarkdownRenderer().format_WebSearchOutput(out, msg) + _no_raw_tag(md) + # The URL is preserved (only the label fragment is neutralised). + assert "(https://example.com)" in md + + def test_project_index_heading_and_session_link(self): + # combined_suppressed → plain `## {display_name}` heading + per-session + # `- [{summary}](file)` links. Payload in BOTH the cwd-derived + # display_name and a session summary. + project_summaries: list[dict[str, Any]] = [ + { + "name": "-home-u-proj", + "html_file": "proj/combined_transcripts.html", + "jsonl_count": 1, + "message_count": 1, + "last_modified": 1700000000.0, + "working_directories": [f"/home/u/{PAYLOAD}"], + "combined_suppressed": True, + "sessions": [ + { + "id": "11110000", + "file": "proj/session-11110000.md", + "summary": PAYLOAD, + } + ], + } + ] + md = MarkdownRenderer().generate_projects_index(project_summaries) + _no_raw_tag(md) + assert "## <img" not in md.lower() # heading neutralised + assert "- [<img" not in md.lower() # session-link label neutralised + assert "(proj/session-11110000.md)" in md # link target preserved + + def test_expand_paths_tree_label(self): + # expand_paths_tree mode builds the index as a directory tree whose + # leaf labels are session labels (summary-derived). + project_summaries: list[dict[str, Any]] = [ + { + "name": "-home-u-proj", + "html_file": "proj/combined_transcripts.html", + "jsonl_count": 1, + "message_count": 1, + "last_modified": 1700000000.0, + "working_directories": ["/home/u/proj"], + "combined_suppressed": True, + "sessions": [ + { + "id": "11110000", + "file": "proj/session-11110000.md", + "summary": PAYLOAD, + } + ], + } + ] + md = MarkdownRenderer().generate_projects_index( + project_summaries, expand_paths_tree=True + ) + _no_raw_tag(md) + assert "[<img" not in md.lower() From db6f2b004b6d5d4bff8f76cf0510d308cae0d331 Mon Sep 17 00:00:00 2001 From: Christian Boos <christian.boos@bct-technology.com> Date: Mon, 29 Jun 2026 19:00:40 +0200 Subject: [PATCH 8/8] Surface + document the plugin XSS contract (#245) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A plugin author told to "protect for XSS" could not succeed from the API or docs: the escaping helpers weren't on the stable plugin surface, and plugins.md had no security guidance. A plugin interpolating transcript data into title()/format_html/format_markdown reproduced the exact sinks this issue closed in the core. - API: re-export `escape_html` (format_html f-strings + a plugin's own title() escaping) and `safe_markdown_inline` (transcript text built into Markdown source) from `claude_code_log.plugins` — added to the PEP-562 lazy re-export + `__all__`, alongside `render_markdown[_collapsible]`. - Docs: plugins.md §4.2 "Security-conscious rendering" — untrusted-every-source rule, a helper-per-context table covering both output paths, a sink-class self-audit checklist, the one-structural-gate principle, and a cross-link to implementing-a-tool-renderer.md. Annotated the §4 method table (title / format_html / format_markdown rows) and changed the title()/format_markdown example from constants to interpolated-and-escaped so the risk is visible. Added a §9 XSS-payload test row. Tests: API re-export + behaviour; a plugin-author-shaped subclass whose render methods follow the contract → safe on BOTH the HTML and Markdown paths. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- claude_code_log/plugins.py | 39 ++++++++++---- dev-docs/plugins.md | 93 ++++++++++++++++++++++++++++---- test/test_plugin_xss_api.py | 105 ++++++++++++++++++++++++++++++++++++ 3 files changed, 217 insertions(+), 20 deletions(-) create mode 100644 test/test_plugin_xss_api.py diff --git a/claude_code_log/plugins.py b/claude_code_log/plugins.py index 8d1c9d2a..c915e188 100644 --- a/claude_code_log/plugins.py +++ b/claude_code_log/plugins.py @@ -39,7 +39,12 @@ # Visible to static type-checkers (pyright/mypy) and to # ``__all__`` validation; resolved at runtime via the # PEP-562 ``__getattr__`` further down. - from .html.utils import render_markdown, render_markdown_collapsible + from .html.utils import ( + escape_html, + render_markdown, + render_markdown_collapsible, + ) + from .markdown.renderer import safe_markdown_inline from .models import MessageContent, MessageMeta @@ -343,20 +348,34 @@ def apply_transformers( # documented signatures. _PUBLIC_HELPERS: frozenset[str] = frozenset( - {"render_markdown", "render_markdown_collapsible"} + { + "render_markdown", + "render_markdown_collapsible", + # Security helpers (#245): a plugin's ``format_html`` / ``title`` may + # interpolate transcript-derived (untrusted) data; without a surfaced + # primitive the author reproduces the title/markdown XSS sinks. See + # ``dev-docs/plugins.md`` §4 "Security-conscious rendering". + "escape_html", + "safe_markdown_inline", + } ) def __getattr__(name: str) -> Any: # PEP 562 if name in _PUBLIC_HELPERS: - from .html.utils import ( - render_markdown as _rm, - render_markdown_collapsible as _rmc, - ) + # ``safe_markdown_inline`` (the markdown renderer's inline HTML- + # neutralising gate — entity-escapes raw HTML tags in an inline + # markdown fragment, preserving markdown) lives in + # ``markdown/renderer.py``; the others in ``html/utils.py``. Resolved + # lazily to keep package init acyclic. + if name == "safe_markdown_inline": + from .markdown.renderer import safe_markdown_inline as resolved + else: + from .html import utils as _utils - globals()["render_markdown"] = _rm - globals()["render_markdown_collapsible"] = _rmc - return globals()[name] + resolved = getattr(_utils, name) + globals()[name] = resolved + return resolved raise AttributeError(f"module {__name__!r} has no attribute {name!r}") @@ -364,8 +383,10 @@ def __getattr__(name: str) -> Any: # PEP 562 "ENTRY_POINT_GROUP", "MessageTransformer", "apply_transformers", + "escape_html", "load_transformers", "render_markdown", "render_markdown_collapsible", "reset_cache", + "safe_markdown_inline", ] diff --git a/dev-docs/plugins.md b/dev-docs/plugins.md index f8136750..eb492008 100644 --- a/dev-docs/plugins.md +++ b/dev-docs/plugins.md @@ -212,6 +212,7 @@ renderer's dispatcher consults them after the renderer's own from dataclasses import dataclass from typing import ClassVar, Optional from claude_code_log.models import DetailLevel, ToolUseMessage +from claude_code_log.plugins import escape_html, safe_markdown_inline @dataclass class MyToolMessage(ToolUseMessage): @@ -220,23 +221,34 @@ class MyToolMessage(ToolUseMessage): detail_visibility: ClassVar[DetailLevel] = DetailLevel.LOW def format_markdown(self, _renderer, _message) -> str: + # ``action`` is transcript-derived (untrusted — see §4.2). Interpolated + # into Markdown SOURCE, so neutralise raw HTML with safe_markdown_inline + # (the Markdown-output path emits this verbatim). action = (self.input.input or {}).get("action", "?") - return f"_(my plugin) action={action}_" + return f"_(my plugin) action={safe_markdown_inline(action)}_" def format_html(self, _renderer, _message) -> Optional[str]: return None # fall back to mistune(format_markdown) def title(self, _renderer, _message) -> Optional[str]: - return "✉ my plugin" + # A title() return goes to ``{{ message_title | safe }}`` with NO core + # HTML escaping — escape transcript-derived interpolation yourself + # (§4.2). The Markdown heading path is auto-gated by the core. + action = (self.input.input or {}).get("action", "?") + return f"✉ my plugin: {escape_html(action)}" ``` +> The constant-string forms (`"✉ my plugin"`, `"_(my plugin)_"`) need no +> escaping — only **transcript-derived interpolation** does. See +> [§4.2](#42-security-conscious-rendering) for the full contract. + Signature contract for each method: | Method | Signature | Return | Notes | |---|---|---|---| -| `format_markdown` | `(self, renderer, message) -> str` | Markdown source string. | Define this whenever your class produces meaningful Markdown. Drives both Markdown output AND HTML output (via mistune) unless `format_html` is also defined. | -| `format_html` | `(self, renderer, message) -> str` | Raw HTML string (real string — no None sentinel). | Define this ONLY when you need HTML different from mistune-of-`format_markdown`. The dispatcher synthesizes that fallback automatically when `format_html` is absent. | -| `title` | `(self, renderer, message) -> Optional[str]` | Heading text or `None`. | Return `None` for "headless" (inline) messages. Return `""` (empty string, not None) to suppress the heading explicitly — the dispatcher distinguishes the two. | +| `format_markdown` | `(self, renderer, message) -> str` | Markdown source string. | Define this whenever your class produces meaningful Markdown. Drives both Markdown output AND HTML output (via mistune) unless `format_html` is also defined. **Escape transcript-derived interpolation** ([§4.2](#42-security-conscious-rendering)): the Markdown-output path emits this verbatim. | +| `format_html` | `(self, renderer, message) -> str` | Raw HTML string (real string — no None sentinel). | Define this ONLY when you need HTML different from mistune-of-`format_markdown`. The dispatcher synthesizes that fallback automatically when `format_html` is absent. **You own escaping** — the return is injected as live DOM; `escape_html` every transcript-derived interpolation ([§4.2](#42-security-conscious-rendering)). | +| `title` | `(self, renderer, message) -> Optional[str]` | Heading text or `None`. | Return `None` for "headless" (inline) messages. Return `""` (empty string, not None) to suppress the heading explicitly — the dispatcher distinguishes the two. **`escape_html` transcript-derived interpolation** — the title is emitted via `\| safe` with no core escaping ([§4.2](#42-security-conscious-rendering)). | **`format_html` is opt-in.** If your plugin class defines only `format_markdown`, the HtmlRenderer dispatcher automatically @@ -271,22 +283,27 @@ moves to the next ancestor. ### 4.1 Plugin-facing helpers -Two helpers are re-exported from `claude_code_log.plugins` for use -in `format_html` / `format_markdown` methods. The re-export is the -stable plugin API; the underlying implementation in -`claude_code_log/html/utils.py` may move or be renamed. +Four helpers are re-exported from `claude_code_log.plugins` for use +in `format_html` / `format_markdown` / `title` methods. The re-export is +the stable plugin API; the underlying implementation (in +`claude_code_log/html/utils.py` and `claude_code_log/markdown/renderer.py`) +may move or be renamed. ```python from claude_code_log.plugins import ( render_markdown, render_markdown_collapsible, + escape_html, + safe_markdown_inline, ) ``` | Helper | Signature | Use when | |---|---|---| -| `render_markdown(text)` | `(str) -> str` | You need Markdown→HTML inside a custom `format_html` (e.g. embedding a Markdown fragment in a richer HTML scaffold). | -| `render_markdown_collapsible(raw_content, css_class, *, line_threshold=20, preview_line_count=5)` | `(str, str, int, int) -> str` | Long Markdown bodies (mail bodies, agent responses, multi-paragraph result text). Returns inline `<div class="{css_class} markdown">…</div>` for short content, a collapsible `<details>` with preview + full body for content exceeding `line_threshold`. | +| `render_markdown(text)` | `(str) -> str` | You need Markdown→HTML inside a custom `format_html` (e.g. embedding a Markdown fragment in a richer HTML scaffold). Escapes raw HTML in `text` (untrusted-safe). | +| `render_markdown_collapsible(raw_content, css_class, *, line_threshold=20, preview_line_count=5)` | `(str, str, int, int) -> str` | Long Markdown bodies (mail bodies, agent responses, multi-paragraph result text). Returns inline `<div class="{css_class} markdown">…</div>` for short content, a collapsible `<details>` with preview + full body for content exceeding `line_threshold`. Escapes raw HTML. | +| `escape_html(text)` | `(str) -> str` | Interpolating transcript-derived text into a `format_html` raw-HTML string, OR into a `title` return (which is emitted via `\| safe`). Entity-escapes `<`, `>`, `&`, quotes. | +| `safe_markdown_inline(text)` | `(str) -> str` | Interpolating transcript-derived text into a Markdown **inline** fragment in `format_markdown` (a link label, a list item, inline prose) — the Markdown-output path emits `format_markdown` verbatim. Entity-escapes raw HTML tags while preserving Markdown formatting. | The reference plugin's [`tool_communicate_result.py`](../test/_plugins/clmail/src/claude_code_log_clmail_test/transformers/tool_communicate_result.py) @@ -297,6 +314,59 @@ Add to `claude_code_log.plugins.__all__` only on concrete plugin-author demand — every entry is an API commitment. Open an issue if a helper you need isn't exposed. +### 4.2 Security-conscious rendering + +**Every transcript-derived value is untrusted.** A transcript is not a +trusted document: the assistant routinely echoes arbitrary user / file / +web input verbatim ("write a test that types `<script>alert(1)</script>` +into the field"), and tool names, hook names, `cwd`-derived project names, +session summaries and the like are all attacker-influenceable. There is no +"trusted source" — if a value came from the transcript, escape it before it +reaches a rendered position. A plugin that interpolates transcript data into +its render methods reproduces the exact XSS sink class that issue #245 closed +in the core. + +**Helper per context** — pick by *where the value lands*, covering **both** +output paths: + +| Context (where the value lands) | Helper | +|---|---| +| `format_html` raw-HTML f-string (text or attribute) | `escape_html(value)` | +| `format_html` embedding a Markdown body | `render_markdown(value)` / `render_markdown_collapsible(value, …)` | +| `format_markdown` inline fragment (link label, list item, inline prose) | `safe_markdown_inline(value)` | +| `title()` return | `escape_html(value)` — the HTML header emits it via `\| safe` (no core escaping); the Markdown heading is auto-gated by the core | + +Notes: + +- **Don't double-escape.** Pass a value through *one* helper for its context. + Don't `escape_html` a value and then also feed it to `render_markdown` + (you'd get visible `&lt;` entities). A constant string needs no helper. +- **`format_markdown` HTML safety is automatic; Markdown safety is not.** + When only `format_markdown` is defined, the HTML path runs it through + `render_markdown` (escapes raw HTML for you). The Markdown-output path emits + `format_markdown` verbatim, so inline transcript interpolation there needs + `safe_markdown_inline`. + +**Sink-class self-audit checklist** — scan your render methods for: + +- a raw f-string interpolating transcript data into HTML (`format_html`) +- transcript data interpolated into a `title()` return +- transcript data in a Markdown heading or inline link/list label + (`format_markdown`) +- anything you emit into a `\| safe` / non-autoescaped context + +**Prefer one structural gate over per-site escaping.** The #245 class took +several review rounds precisely because each sink was individually "known" but +no single chokepoint enforced neutralisation, so new sinks kept slipping in. +The core now routes every Markdown heading/label through one +`safe_markdown_inline` gate. Apply the same principle in your plugin: if you +build several rendered strings from transcript data, funnel them through one +helper at one place rather than remembering to escape at each call site. + +For the core-renderer view of this contract (the per-field escaping the host +renderer applies, and the title/markdown gate internals), see +[implementing-a-tool-renderer.md](implementing-a-tool-renderer.md). + --- ## 5. Dispatch resolution order @@ -573,6 +643,7 @@ own plugin should follow the same shape: | **2. Dispatch matrix** | Renderer-side vs class-side resolution; HTML vs Markdown output | Skip unless your plugin does something exotic with the dispatcher. | | **3. Transformer integration** | End-to-end: real `MessageContent` through your `transform()` and class-side render methods | Always write this. Drive your transformer with hand-built `MessageMeta.empty()` candidates; assert the replacement is an instance of your subclass and that the render methods return the expected text. | | **4. Text-equivalence** | If your plugin reads `UserTextMessage.items`, assert that the joined text matches what the factory's `extract_text_content` produces | Recommended for any plugin keying on user text — protects you against future core refactors that sneak normalization between extraction and the items list. | +| **5. XSS payload** | Feed a payload (`<img src=x onerror=alert(1)>`) into every transcript-derived field your render methods interpolate, render BOTH paths, and assert the raw tag is escaped (not present verbatim) | Write this whenever your `format_html` / `format_markdown` / `title` interpolates transcript data ([§4.2](#42-security-conscious-rendering)). String-level: assert `"<img"` is absent and the escaped form is present, in both the HTML and Markdown output. | For an installable test plugin (your own or a fixture in your own repo), declare it as an editable dev-dependency and reset the loader diff --git a/test/test_plugin_xss_api.py b/test/test_plugin_xss_api.py new file mode 100644 index 00000000..1ba83cc6 --- /dev/null +++ b/test/test_plugin_xss_api.py @@ -0,0 +1,105 @@ +"""Plugin XSS API contract (#245 follow-up). + +Two things a plugin author needs and previously couldn't get from the stable +surface: + +1. The escaping helpers are re-exported from ``claude_code_log.plugins`` + (escape_html, safe_markdown_inline) alongside render_markdown[_collapsible]. +2. A plugin that follows the documented contract (dev-docs/plugins.md §4.2 — + escape transcript-derived interpolation, both output paths) produces safe + output on BOTH the HTML and Markdown render paths. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +import claude_code_log.plugins as plugins +from claude_code_log.html.renderer import HtmlRenderer +from claude_code_log.markdown.renderer import MarkdownRenderer +from claude_code_log.models import MessageContent, MessageMeta +from claude_code_log.plugins import escape_html, safe_markdown_inline +from claude_code_log.renderer import TemplateMessage + +PAYLOAD = "<img src=x onerror=alert(1)>" + + +# ----------------------------- API re-export --------------------------------- + + +class TestPluginApiReExports: + def test_helpers_importable_and_in_all(self): + for name in ( + "render_markdown", + "render_markdown_collapsible", + "escape_html", + "safe_markdown_inline", + ): + assert hasattr(plugins, name), name + assert name in plugins.__all__, name + + def test_escape_html_behaviour(self): + assert escape_html(PAYLOAD) == "<img src=x onerror=alert(1)>" + + def test_safe_markdown_inline_behaviour(self): + assert safe_markdown_inline(PAYLOAD) == "<img src=x onerror=alert(1)>" + # Tag-free text passes through unchanged (no markdown re-normalisation). + assert safe_markdown_inline("a **bold** label") == "a **bold** label" + + +# ----------------------------- plugin-author contract ------------------------ + + +@dataclass +class _PluginContent(MessageContent): + """A plugin-style MessageContent that interpolates transcript-derived data + (``payload``) into all three render methods, escaping per dev-docs §4.2.""" + + payload: str = "" + + @property + def message_type(self) -> str: + return "plugin_demo" + + # HTML output: the return is injected as live DOM → escape_html. + def format_html(self, _renderer, _message) -> str: + return f"<div class='plug'>{escape_html(self.payload)}</div>" + + # Markdown output: emitted verbatim → safe_markdown_inline for inline text. + def format_markdown(self, _renderer, _message) -> str: + return f"plug: {safe_markdown_inline(self.payload)}" + + # title() goes to {{ message_title | safe }} (HTML, no core escaping) → + # escape_html; the Markdown heading path is additionally core-gated. + def title(self, _renderer, _message) -> str: + return f"Plug: {escape_html(self.payload)}" + + +def _msg() -> TemplateMessage: + content = _PluginContent( + meta=MessageMeta(uuid="u", session_id="s", timestamp="2025-01-01T00:00:00Z"), + payload=PAYLOAD, + ) + return TemplateMessage(content) + + +class TestPluginAuthorContractIsSafeBothPaths: + def test_html_format_escaped(self): + out = HtmlRenderer().format_content(_msg()) + assert "<img" not in out.lower() + assert "<img" in out.lower() + + def test_html_title_escaped(self): + out = HtmlRenderer().title_content(_msg()) + assert "<img" not in out.lower() + assert "<img" in out.lower() + + def test_markdown_format_escaped(self): + out = MarkdownRenderer().format_content(_msg()) + assert "<img" not in out.lower() + assert "<img" in out.lower() + + def test_markdown_title_escaped(self): + out = MarkdownRenderer().title_content(_msg()) + assert "<img" not in out.lower() + assert "<img" in out.lower()