Skip to content
43 changes: 42 additions & 1 deletion claude_code_log/html/renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -943,7 +943,15 @@ def title_ToolUseMessage(
content.input, AskUserQuestionInput
) and self._paired_answer_supersedes(message):
return ""
return super().title_ToolUseMessage(content, message)
# Specialized tools dispatch to a title_*Input method that escapes via
# ``_tool_title``. Tools with NO specialized method (generic / mcp__* /
# ToolSearch / custom) fall back to the raw tool name — which is
# attacker-controllable and lands live in the header span. Escape it
# here rather than in the shared base ``title_ToolUseMessage`` (the
# Markdown renderer must not get HTML-entity-escaped titles). #245 XSS.
if title := self._dispatch_title(content.input, message):
return title
return escape_html(content.tool_name)
Comment thread
coderabbitai[bot] marked this conversation as resolved.

def title_ToolResultMessage(
self, content: ToolResultMessage, message: TemplateMessage
Expand All @@ -964,6 +972,39 @@ def title_ToolResultMessage(
return f"{base} {marker}" if base else marker
return base

# Title overrides that escape their transcript-derived field for the HTML
# header span. These titles are built on the shared base ``Renderer`` (also
# used by the Markdown renderer, which must NOT receive HTML-entity-escaped
# titles), so the escaping lives on the HTML path only — mirroring how
# ``_tool_title`` escapes the tool name for specialized tools. #245 XSS.

def title_HookAttachmentMessage(
    self, content: HookAttachmentMessage, _: TemplateMessage
) -> str:
    """Return the header title for a hook attachment: ``Hook · <label>``.

    The label is the first truthy value among ``hook_name``, ``hook_event``
    and ``kind`` (falling through to ``kind`` when the first two are empty).
    ``hook_name`` (e.g. "PostToolUse:TaskUpdate") is transcript-derived and
    lands in the header, so the label is passed through ``escape_html``
    before it is interpolated.
    """
    label = content.hook_name
    if not label:
        label = content.hook_event
    if not label:
        label = content.kind
    safe_label = escape_html(label)
    return f"Hook · {safe_label}"

def title_WorkflowPhaseMessage(
    self, content: WorkflowPhaseMessage, _: TemplateMessage
) -> str:
    """Return ``Phase: <title>`` with the title HTML-escaped, or ``Phase``.

    The bare ``Phase`` form is used when ``content.title`` is empty/falsy.
    """
    if not content.title:
        return "Phase"
    return f"Phase: {escape_html(content.title)}"

def title_WorkflowAgentMessage(
    self, content: WorkflowAgentMessage, _: TemplateMessage
) -> str:
    """Return ``Agent <label>`` with the label HTML-escaped, or ``Agent``.

    The bare ``Agent`` form is used when ``content.label`` is empty/falsy.
    """
    if not content.label:
        return "Agent"
    return f"Agent {escape_html(content.label)}"

def title_SystemMessage(self, content: SystemMessage, _: TemplateMessage) -> str:
    """Return ``System <Level>`` with the title-cased level HTML-escaped.

    A missing/empty ``content.level`` is shown as ``Unknown``.
    """
    # ``level`` is FREE-TEXT from the transcript (``system_factory``:
    # ``transcript.level or "info"``), not an enum, so it can carry a payload
    # that lands in the header (#245 XSS).
    raw_level = content.level or "unknown"
    # Order matters: title-case the RAW text, THEN escape. Escaping first
    # would let ``.title()`` capitalize the entity names (``&lt;`` →
    # ``&Lt;``) and break the escaping.
    title_cased = raw_level.title()
    return f"System {escape_html(title_cased)}"

def title_TaskInput(self, input: TaskInput, message: TemplateMessage) -> str:
"""Title → '🔧 Task <desc> (subagent_type) [async #<id>]'.

Expand Down
48 changes: 35 additions & 13 deletions claude_code_log/html/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,24 @@ def block_code(code: str, info: Optional[str] = None) -> str:

@functools.lru_cache(maxsize=1)
def _get_markdown_renderer() -> mistune.Markdown:
"""Get cached Mistune markdown renderer with Pygments syntax highlighting."""
"""Get cached Mistune markdown renderer with Pygments syntax highlighting.

Uses ``escape=True`` so raw HTML embedded in the source text
(``<script>``, ``<img onerror=…>``, bare ``<b>``, …) is rendered as
literal entity-escaped text rather than injected as live DOM.

This renderer handles assistant/tool/web-authored content (assistant
prose, Task/WebSearch/WebFetch results, plans, system messages,
teammate bodies). That content is **not** trusted: the assistant
routinely echoes arbitrary user/file/web input verbatim — e.g. "write
an E2E test that types ``<script>alert(1)</script>`` into the field" —
so rendering it unescaped lets that payload execute when the transcript
HTML is opened. The Markdown output path already neutralises raw HTML
from every source (see ``markdown/renderer.py::_protect_html_tags``);
the HTML path must match. ``escape=True`` does not affect Markdown
formatting, plugin output (Pygments, SHA links), or code fences — only
raw HTML tags in the body.
"""
from ..markdown_plugins import make_codespan_sha_plugin, make_sha_plugin
from ..git_remote import resolve_sha_for_current_render

Expand All @@ -447,7 +464,7 @@ def _get_markdown_renderer() -> mistune.Markdown:
# mistune's built-in rule consumes the backticks.
make_codespan_sha_plugin(resolve_sha_for_current_render),
],
escape=False, # Don't escape HTML since we want to render markdown properly
escape=True, # Escape raw HTML: transcript content is untrusted (XSS)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
hard_wrap=True, # Line break for newlines (checklists in Assistant messages)
)

Expand Down Expand Up @@ -490,11 +507,13 @@ def render_markdown_inline(text: str) -> str:
def _get_user_markdown_renderer() -> mistune.Markdown:
"""Markdown renderer for user-authored text.

Differs from the shared renderer in one critical way: ``escape=True``
so a user typing raw ``<script>`` or other HTML sees the literal
characters rendered as code, not injected into the DOM. Assistant
content uses ``escape=False`` deliberately (tool output renders
pre-formed HTML); user content must not bypass escaping.
Uses ``escape=True`` so raw ``<script>`` or other HTML in the source is
rendered as literal escaped text, not injected into the DOM. The shared
renderer (``_get_markdown_renderer``) was historically ``escape=False``
for assistant/tool output that emitted pre-formed HTML; it now also
escapes — transcript content is untrusted from every source (#245 XSS),
so both pipelines neutralise raw HTML. This one is retained for the
user-content call sites (``render_user_markdown``).
"""
from ..markdown_plugins import make_codespan_sha_plugin, make_sha_plugin
from ..git_remote import resolve_sha_for_current_render
Expand Down Expand Up @@ -652,8 +671,10 @@ def _markdown_collapsible(
preview_line_count: int,
) -> str:
"""Shared body for the collapsible-markdown helpers, parameterized by the
markdown render function (escape=False for assistant/tool output vs
escape=True for untrusted content)."""
markdown render function. Both render functions escape raw HTML
(``escape=True``): transcript content is untrusted regardless of source —
assistant/tool output routinely echoes arbitrary user/file/web input — so
raw tags are neutralised rather than injected as live DOM (XSS)."""
rendered_html = render_fn(raw_content)

lines = raw_content.splitlines()
Expand Down Expand Up @@ -686,10 +707,11 @@ def render_markdown_collapsible(
For long content, creates a collapsible details element with a preview.
For short content, renders inline with the specified CSS class.

Uses the ``escape=False`` renderer — for assistant/tool-authored content
(Task results, WebSearch/WebFetch, plans) that may emit pre-formed HTML.
For untrusted content (e.g. memory files), use
``render_user_markdown_collapsible`` instead.
Renders via the shared HTML-escaping renderer (``render_markdown``),
so raw HTML in assistant/tool/web-authored content (Task results,
WebSearch/WebFetch, plans) is neutralised — transcript content is
untrusted (the assistant echoes arbitrary input). Markdown formatting,
Pygments highlighting and code fences are unaffected.

Args:
raw_content: The raw text content to render as markdown
Expand Down
59 changes: 49 additions & 10 deletions claude_code_log/markdown/renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,37 @@ def _protect_html_tags(text: str) -> str:
return str(rendered).rstrip("\n")


def safe_markdown_inline(text: str) -> str:
    """Neutralise raw HTML in an inline Markdown text fragment (#245 XSS).

    This is the one shared gate for Markdown output positions that
    interpolate transcript-derived text where a downstream viewer would
    render markup: ``#``/``##`` headings (per-message titles and the
    page/project headings) and inline link labels / list items (TOC labels,
    WebSearch result titles, project- and session-index labels, the
    expand-paths tree labels). Keeping the check in a single helper gives
    one place to audit instead of a per-call-site convention.

    Pass only the text FRAGMENT (label / title / heading text), never a
    composed ``[label](url)`` — the caller keeps the link destination.

    Args:
        text: The label, title or heading text to make safe.

    Returns:
        ``text`` itself, untouched, when it contains no ``<``; otherwise the
        result of ``_protect_html_tags(text)``.
    """
    # ``<`` is the only character that can open a tag. Fragments without it
    # skip the mistune round-trip in ``_protect_html_tags`` (which
    # re-normalises markdown backslash escaping) and therefore pass through
    # byte-identical.
    if "<" not in text:
        return text
    return _protect_html_tags(text)


def _render_expand_paths_tree(template_projects: list[Any]) -> list[str]:
"""Render `--expand-paths` Markdown index as a nested bullet-list
directory tree.
Expand Down Expand Up @@ -347,7 +378,7 @@ def _emit(node: dict[str, Any], depth: int) -> None:
_emit(node[name], depth + 1)
for label, url, ts in node.get("_links", []):
ts_suffix = f" — *{ts}*" if ts else ""
lines.append(f"{indent}- [{label}]({url}){ts_suffix}")
lines.append(f"{indent}- [{safe_markdown_inline(label)}]({url}){ts_suffix}")

_emit(root, 0)
return lines
Expand Down Expand Up @@ -1525,7 +1556,7 @@ def format_WebSearchOutput(
parts.append("---")
parts.append("")
for link in output.links:
parts.append(f"- [{link.title}]({link.url})")
parts.append(f"- [{safe_markdown_inline(link.title)}]({link.url})")
elif not output.summary:
# Only show "no results" if there's also no summary
parts.append("*No results found*")
Expand Down Expand Up @@ -1971,7 +2002,7 @@ def _generate_toc(self, session_nav: list[dict[str, Any]]) -> str:
if summary
else f"Session `{session_short}`"
)
lines.append(f"- [{label}](#{anchor})")
lines.append(f"- [{safe_markdown_inline(label)}](#{anchor})")
lines.append("")
return "\n".join(lines)

Expand Down Expand Up @@ -2049,7 +2080,9 @@ def _render_message(self, msg: TemplateMessage, level: int) -> str:

if not suppress_heading:
heading_level = min(level, 6) # Markdown max is h6
parts.append(f"{'#' * heading_level} {title}")
# Neutralise raw HTML in the title via the single heading gate
# (#245 XSS) — see ``safe_markdown_inline``.
parts.append(f"{'#' * heading_level} {safe_markdown_inline(title)}")
# Per-message timestamp line (issue #160). Skip for
# session headers (they have no meaningful per-msg time)
# and when the heading was suppressed by `compact` mode
Expand Down Expand Up @@ -2142,7 +2175,7 @@ def _generate_inner(
}

parts = [f"<!-- Generated by claude-code-log v{get_library_version()} -->", ""]
parts.append(f"# {title}")
parts.append(f"# {safe_markdown_inline(title)}")

# Table of Contents
if session_nav:
Expand Down Expand Up @@ -2225,7 +2258,7 @@ def generate_projects_index(
template_projects, template_summary = prepare_projects_index(project_summaries)

parts = [f"<!-- Generated by claude-code-log v{get_library_version()} -->", ""]
parts.append(f"# {title}")
parts.append(f"# {safe_markdown_inline(title)}")

# Summary stats
parts.append(
Expand All @@ -2252,11 +2285,15 @@ def generate_projects_index(
# `--combined no` mode: header is a plain heading (no
# link to the non-existent combined file); per-session
# bullets link directly to `session-{id}.md` files.
parts.append(f"## {project.display_name}")
parts.append(f"## {safe_markdown_inline(project.display_name)}")
else:
# Derive markdown link from html_file path
# Derive markdown link from html_file path. Neutralise only the
# display_name fragment (transcript-reachable via cwd) so the
# link target is preserved.
md_link = project.html_file.replace(".html", ".md")
parts.append(f"## [{project.display_name}]({md_link})")
parts.append(
f"## [{safe_markdown_inline(project.display_name)}]({md_link})"
)
# Use actual session count (filtered) like HTML does
session_count = (
len(project.sessions) if project.sessions else project.jsonl_count
Expand Down Expand Up @@ -2284,7 +2321,9 @@ def generate_projects_index(
timestamp_suffix = (
f" — *{timestamp_range}*" if timestamp_range else ""
)
parts.append(f"- [{label}]({file_link}){timestamp_suffix}")
parts.append(
f"- [{safe_markdown_inline(label)}]({file_link}){timestamp_suffix}"
)
parts.append("")

return "\n".join(parts)
Expand Down
39 changes: 30 additions & 9 deletions claude_code_log/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,12 @@
# Visible to static type-checkers (pyright/mypy) and to
# ``__all__`` validation; resolved at runtime via the
# PEP-562 ``__getattr__`` further down.
from .html.utils import render_markdown, render_markdown_collapsible
from .html.utils import (
escape_html,
render_markdown,
render_markdown_collapsible,
)
from .markdown.renderer import safe_markdown_inline

from .models import MessageContent, MessageMeta

Expand Down Expand Up @@ -343,29 +348,45 @@ def apply_transformers(
# documented signatures.

_PUBLIC_HELPERS: frozenset[str] = frozenset(
    {
        "render_markdown",
        "render_markdown_collapsible",
        # Security helpers (#245): a plugin's ``format_html`` / ``title`` may
        # interpolate transcript-derived (untrusted) data; without a surfaced
        # primitive the author reproduces the title/markdown XSS sinks. See
        # ``dev-docs/plugins.md`` §4 "Security-conscious rendering".
        "escape_html",
        "safe_markdown_inline",
    }
)


def __getattr__(name: str) -> Any:  # PEP 562
    """Lazily resolve the public helper re-exports on first attribute access.

    Args:
        name: The attribute requested on this module.

    Returns:
        The helper named ``name``. It is also stored in the module globals,
        so later lookups find it directly without going through this hook.

    Raises:
        AttributeError: If ``name`` is not listed in ``_PUBLIC_HELPERS``.
    """
    if name in _PUBLIC_HELPERS:
        # ``safe_markdown_inline`` (the markdown renderer's inline HTML-
        # neutralising gate — entity-escapes raw HTML tags in an inline
        # markdown fragment, preserving markdown) lives in
        # ``markdown/renderer.py``; the others in ``html/utils.py``. Resolved
        # lazily to keep package init acyclic.
        if name == "safe_markdown_inline":
            from .markdown.renderer import safe_markdown_inline as resolved
        else:
            from .html import utils as _utils

            resolved = getattr(_utils, name)
        # Cache only the requested name; every helper in ``_PUBLIC_HELPERS``
        # goes through this same path, so none needs a hard-coded entry.
        globals()[name] = resolved
        return resolved
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


# Public API of this module. ``escape_html``, ``render_markdown``,
# ``render_markdown_collapsible`` and ``safe_markdown_inline`` are the names
# listed in ``_PUBLIC_HELPERS``; they are resolved at runtime by the
# module-level ``__getattr__`` (PEP 562).
__all__ = [
"ENTRY_POINT_GROUP",
"MessageTransformer",
"apply_transformers",
"escape_html",
"load_transformers",
"render_markdown",
"render_markdown_collapsible",
"reset_cache",
"safe_markdown_inline",
]
24 changes: 24 additions & 0 deletions dev-docs/implementing-a-tool-renderer.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,30 @@ def format_websearch_output(output: WebSearchOutput) -> str:
return render_markdown_collapsible(markdown_content, "websearch-results")
```

### Escaping: all transcript content is untrusted

Treat every value that comes out of a transcript as attacker-controlled.
The assistant routinely echoes arbitrary user/file/web input verbatim — a
prompt like *"write an E2E test that types `<script>alert(1)</script>` into
the field"* lands that payload in assistant prose, a tool result, and a
Write tool's file content. If it reaches the HTML unescaped it executes when
the file is opened. There is no "trusted" source here.

Two safe paths, depending on what you emit:

- **Building HTML with f-strings/format** → run every interpolated value
through `escape_html()` first (as the input formatter above does with
`escaped_query`). Never interpolate a raw field into markup.
- **Rendering markdown** → use `render_markdown` / `render_markdown_collapsible`.
Both use mistune with `escape=True`, so raw HTML tags in the body are
escaped to entities and unsafe link/image schemes (`javascript:`, `data:`)
are neutralised, while Markdown, code fences and Pygments still render.

Regression coverage lives in `test/test_markdown_rendering.py` (unit) and
`test/test_xss_browser.py` (empirical: opens the file in a real browser and
asserts no `alert()` dialog fires). Add a payload-bearing case for any new
field you render.

Comment thread
coderabbitai[bot] marked this conversation as resolved.
### Update Exports

Add functions to `__all__`:
Expand Down
Loading
Loading