import re  # needed by the compiled patterns below (added to core/llm.py's imports)


# Patterns for chain-of-thought that leaks into the visible `content` field.
# Every pattern is lazy (`.*?`) and DOTALL so it removes one complete block at
# a time, across newlines, without swallowing the text between two blocks.
_REASONING_PATTERNS = [
    # DeepSeek/Qwen-style reasoning tags, if they surface inline in `content`.
    re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE),
    # Gemma 4 native reasoning channel: <|channel>thought ... <channel|>.
    # A well-behaved Gemma 4 emits these special tokens; llama.cpp does not
    # parse them, so they leak into `content`. The optional pipes accept the
    # <|channel>, <channel|>, <|channel|> and <channel> spellings.
    re.compile(
        r"<\|?channel\|?>\s*thought\b.*?<\|?channel\|?>",
        re.DOTALL | re.IGNORECASE,
    ),
    # Markdown-fenced reasoning block (```thought ... ```): the form actually
    # observed in production. An abliterated Gemma 4 merge degrades
    # special-token adherence and approximates its thought channel as a
    # plaintext fence.
    re.compile(
        r"```(?:thought|thinking|reasoning)\b.*?```",
        re.DOTALL | re.IGNORECASE,
    ),
]


def strip_reasoning(content: str) -> str:
    """
    Remove leaked chain-of-thought from model output.

    Some models, abliterated community merges in particular, emit their
    reasoning trace as visible content instead of through a separate reasoning
    channel: as ``<think>...</think>`` tags, as a Gemma-style
    ``<|channel>thought ... <channel|>`` span, or as a ```thought fenced
    block. That text must not reach a user-facing reply.

    Model-agnostic and a true no-op when no reasoning block is present: the
    input is returned unchanged, including its leading/trailing whitespace.
    Surrounding whitespace is trimmed only when something was removed.

    Args:
        content: Raw assistant text; may be empty.

    Returns:
        ``content`` with every matched reasoning block removed and the result
        stripped, ``""`` if nothing but reasoning was present, or ``content``
        itself when there was nothing to remove.
    """
    if not content:
        return content

    # Cheap substring pre-check so typical clean replies never reach the regex
    # engine. Each marker is a necessary substring of one of the patterns.
    lowered = content.lower()
    if (
        "```" not in content
        and "<think" not in lowered
        and "channel" not in lowered
    ):
        return content

    cleaned = content
    for pat in _REASONING_PATTERNS:
        cleaned = pat.sub("", cleaned)

    # Compare BEFORE trimming: a reply that merely contains a code fence or
    # the word "channel" must come back untouched, not whitespace-stripped and
    # misreported as leaked reasoning.
    if cleaned == content:
        return content

    cleaned = cleaned.strip()
    if not cleaned:
        logger.warning(
            "strip_reasoning: model output was entirely reasoning trace, "
            "nothing left after strip (%d chars removed)", len(content),
        )
        return ""
    logger.info(
        "strip_reasoning: removed %d chars of leaked reasoning",
        len(content) - len(cleaned),
    )
    return cleaned


# Call sites touched by the same change (only partially visible here):
#   _do_chat_completion:   content = strip_reasoning(message.content or "")
#   _do_stream_completion: "content": strip_reasoning("".join(content_parts))