Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 56 additions & 2 deletions core/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import json
import logging
import os
import re
from typing import Any, AsyncIterator

import httpx
Expand Down Expand Up @@ -531,6 +532,55 @@ def _extract_responses_result(response: Any) -> dict[str, Any]:
return {"content": content, "tool_calls": tool_calls, "raw": response}


# Patterns for the known shapes of leaked chain-of-thought. All are compiled
# once at import time with DOTALL (traces span multiple lines) and IGNORECASE,
# and are applied in list order by strip_reasoning().
_REASONING_PATTERNS = [
    # DeepSeek/Qwen-style reasoning tags, if they surface inline in `content`.
    re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE),
    # Gemma 4 native reasoning channel — <|channel>thought ... <channel|>.
    # A well-behaved Gemma 4 emits these special tokens; llama.cpp does not
    # parse them, so they leak into `content`.
    re.compile(r"<\|?channel\|?>\s*thought\b.*?<\|?channel\|?>", re.DOTALL | re.IGNORECASE),
    # Markdown-fenced reasoning block — ```thought ... ``` — the form actually
    # observed in production: an abliterated Gemma 4 merge degrades special-token
    # adherence and approximates its thought channel as a plaintext fence.
    re.compile(r"```(?:thought|thinking|reasoning)\b.*?```", re.DOTALL | re.IGNORECASE),
]


def strip_reasoning(content: str) -> str:
    """
    Remove leaked chain-of-thought from model output.

    Some models — abliterated community merges in particular — emit their
    reasoning trace as visible content instead of through a separate reasoning
    channel: either ``<think>...</think>`` tags, a Gemma-style
    ``<|channel>thought ... <channel|>`` span, or a ```thought fenced block.
    Hexis never wants that text in a user-facing reply.

    Model-agnostic and a true no-op when no reasoning pattern matches: the
    input is returned unchanged (surrounding whitespace included), so it is
    safe to apply unconditionally regardless of which model/provider is
    configured.

    Args:
        content: Raw text content returned by the model. May be empty.

    Returns:
        ``content`` itself when it is empty or contains no reasoning block;
        otherwise the text with every matched block removed and the result
        stripped of leading/trailing whitespace ("" if nothing remains).
    """
    if not content:
        return content

    # Cheap substring pre-check so the common clean reply never hits the regexes.
    lowered = content.lower()
    if "```" not in content and "<think>" not in lowered and "channel" not in lowered:
        return content

    cleaned = content
    removed_blocks = 0
    for pat in _REASONING_PATTERNS:
        cleaned, hits = pat.subn("", cleaned)
        removed_blocks += hits

    if not removed_blocks:
        # The pre-check is deliberately loose (any code fence or the word
        # "channel" passes it). When no pattern actually matched, hand the
        # input back untouched instead of trimming whitespace and logging a
        # bogus "removed reasoning" message.
        return content

    # Removing a block usually leaves stray blank lines at the edges.
    cleaned = cleaned.strip()
    if not cleaned:
        logger.warning(
            "strip_reasoning: model output was entirely reasoning trace, "
            "nothing left after strip (%d chars removed)", len(content),
        )
        return ""
    logger.info(
        "strip_reasoning: removed %d chars of leaked reasoning", len(content) - len(cleaned),
    )
    return cleaned


_PROVIDER_ALIASES: dict[str, str] = {
"openai_chat_completions_endpoint": "openai-chat-completions-endpoint",
"openai_codex": "openai-codex",
Expand Down Expand Up @@ -1004,7 +1054,7 @@ async def _do_gemini_completion():
async def _do_chat_completion():
    """Run one non-streaming chat completion and normalise its result.

    Awaits ``client.chat.completions.create(**payload)`` and reads the first
    choice's message. ``client`` and ``payload`` are not defined in this
    block; presumably they are closed over from the enclosing function.

    Returns:
        A dict with ``"content"`` (message text passed through
        ``strip_reasoning``), ``"tool_calls"`` (the message's tool calls
        passed through ``_openai_tool_calls``) and ``"raw"`` (the response
        object as returned by the client).
    """
    response = await client.chat.completions.create(**payload)
    message = response.choices[0].message
    # `or ""` / `or []` substitute empty values when the message carries no
    # text or no tool calls, so the helpers always receive a str / list.
    content = strip_reasoning(message.content or "")
    tool_calls = _openai_tool_calls(message.tool_calls or [])
    return {"content": content, "tool_calls": tool_calls, "raw": response}

Expand Down Expand Up @@ -1263,7 +1313,11 @@ async def _do_stream_completion():
logger.debug("Failed to parse tool arguments: %r", raw_args[:200])
args = {}
tool_calls.append({"id": tc["id"], "name": tc["name"], "arguments": args})
return {"content": "".join(content_parts), "tool_calls": tool_calls, "raw": None}
return {
"content": strip_reasoning("".join(content_parts)),
"tool_calls": tool_calls,
"raw": None,
}
Comment on lines +1316 to +1320

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

Streaming callback path can still leak reasoning tokens.

The final returned content is sanitized here, but on_text_delta(...) receives the raw chunk text earlier in the same function, so leaked reasoning can still be emitted live to clients.

💡 Fix direction
-                if delta and delta.content:
-                    content_parts.append(delta.content)
-                    if on_text_delta:
+                if delta and delta.content:
+                    content_parts.append(delta.content)
+                    if on_text_delta:
                         import asyncio
-                        result = on_text_delta(delta.content)
+                        # Emit only sanitized text (stateful buffering may be needed
+                        # to avoid partial-pattern leaks across chunk boundaries).
+                        sanitized_so_far = strip_reasoning("".join(content_parts))
+                        already_emitted = strip_reasoning("".join(content_parts[:-1]))
+                        safe_delta = sanitized_so_far[len(already_emitted):]
+                        result = on_text_delta(safe_delta) if safe_delta else None
                         if asyncio.iscoroutine(result):
                             await result
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@core/llm.py` around lines 1316 - 1320, The streaming path currently emits raw
chunks via on_text_delta(...) which can leak reasoning even though the final
return uses strip_reasoning; modify the streaming logic inside the same function
so that each chunk sent to on_text_delta is sanitized (e.g., run through
strip_reasoning or a streaming-safe sanitizer) before emitting, or buffer
incremental chunks and apply strip_reasoning prior to any on_text_delta call;
update references to on_text_delta, strip_reasoning, and content_parts
accordingly so live clients never receive unstripped reasoning tokens.


return await _retry_on_transient(_do_stream_completion)

Expand Down