Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .env.public
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,23 @@ WATSONX_URL=https://us-south.ml.cloud.ibm.com # optional
# ── LiteLLM (plan-execute runner) ────────────────────────────────────────────
LITELLM_API_KEY=
LITELLM_BASE_URL=

# ── LLM generation parameters (all runners) ──────────────────────────────────
# Applied to every LLM call across plan-execute, claude-agent, openai-agent,
# deep-agent, and the FMSR server. All variables are optional; unset variables
# fall back to safe defaults shown in comments.
LLM_MAX_TOKENS= # int — max output tokens (default: 4096)
LLM_TEMPERATURE= # float — sampling temperature (default: 0.0)
LLM_TOP_P= # float — nucleus sampling top-p (default: omit)
LLM_REASONING_EFFORT= # none|low|medium|high|max (default: none)
# Controls extended thinking / reasoning depth.
# Mapped per provider: Claude uses effort+thinking,
# OpenAI uses Reasoning.effort (max→xhigh),
# LiteLLM passes reasoning_effort directly.
# Stripped with a warning on unsupported models
# (e.g. WatsonX Llama).
LLM_THINKING_BUDGET_TOKENS= # int — explicit thinking budget (default: omit)
# Only used for legacy Anthropic budget-style
# thinking (pre-4.6 models). Ignored on 4.6+.
LLM_STOP= # comma-separated stop sequences (default: omit)
# Example: LLM_STOP="</answer>,END"
8 changes: 7 additions & 1 deletion src/agent/claude_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@

from observability import agent_run_span, persist_trajectory

from llm.generation import GenerationParams
from .._litellm import LITELLM_PREFIX, resolve_model
from .._prompts import AGENT_SYSTEM_PROMPT
from ..generation_maps import to_claude_agent_options
from ..models import AgentResult, ToolCall, Trajectory, TurnRecord
from ..runner import AgentRunner

Expand Down Expand Up @@ -95,9 +97,12 @@ def __init__(
model: str = _DEFAULT_MODEL,
max_turns: int = 30,
permission_mode: str = "bypassPermissions",
*,
generation: GenerationParams | None = None,
) -> None:
super().__init__(llm, server_paths)
super().__init__(llm, server_paths, generation=generation)
self._model = resolve_model(model)
self._model_id = model
self._sdk_env = _sdk_env(model)
self._max_turns = max_turns
self._permission_mode = permission_mode
Expand All @@ -123,6 +128,7 @@ async def run(self, question: str) -> AgentResult:
permission_mode=self._permission_mode,
env=self._sdk_env,
)
to_claude_agent_options(options, self._generation, self._model_id)

_log.info("ClaudeAgentRunner: starting query (model=%s)", self._model)
answer = ""
Expand Down
17 changes: 13 additions & 4 deletions src/agent/deep_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@

from observability import agent_run_span, persist_trajectory

from llm.generation import GenerationParams
from .._litellm import LITELLM_PREFIX, resolve_model
from .._prompts import AGENT_SYSTEM_PROMPT
from ..generation_maps import to_chat_openai_kwargs
from ..models import AgentResult, ToolCall, Trajectory, TurnRecord
from ..runner import AgentRunner

Expand All @@ -39,14 +41,15 @@
_DEFAULT_MODEL = "litellm_proxy/aws/claude-opus-4-6"


def _build_chat_model(model_id: str):
def _build_chat_model(model_id: str, extra_kwargs: dict | None = None):
"""Construct a LangChain chat model for *model_id*.

When the ID uses the ``litellm_proxy/`` prefix, a :class:`ChatOpenAI`
instance is pointed at the LiteLLM proxy (using ``LITELLM_BASE_URL`` and
``LITELLM_API_KEY``). Otherwise the model string is passed to
``init_chat_model`` so any provider supported by LangChain can be used.
"""
extra_kwargs = extra_kwargs or {}
if model_id.startswith(LITELLM_PREFIX):
base_url = os.environ.get("LITELLM_BASE_URL")
api_key = os.environ.get("LITELLM_API_KEY")
Expand All @@ -61,11 +64,12 @@ def _build_chat_model(model_id: str):
model=resolve_model(model_id),
base_url=base_url,
api_key=api_key,
**extra_kwargs,
)

from langchain.chat_models import init_chat_model

return init_chat_model(model_id)
return init_chat_model(model_id, **extra_kwargs)


def _build_mcp_connections(
Expand Down Expand Up @@ -168,15 +172,20 @@ def __init__(
server_paths: dict[str, Path | str] | None = None,
model: str = _DEFAULT_MODEL,
recursion_limit: int = 100,
*,
generation: GenerationParams | None = None,
) -> None:
super().__init__(llm, server_paths)
super().__init__(llm, server_paths, generation=generation)
self._model_id = model
self._recursion_limit = recursion_limit

@cached_property
def _chat_model(self):
"""LangChain chat model, built once per runner instance."""
return _build_chat_model(self._model_id)
return _build_chat_model(
self._model_id,
to_chat_openai_kwargs(self._generation, self._model_id),
)

async def run(self, question: str) -> AgentResult:
"""Run the deep-agents loop for *question*.
Expand Down
185 changes: 185 additions & 0 deletions src/agent/generation_maps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
"""Maps :class:`~llm.GenerationParams` to each agent SDK's native config types.

Import chain is intentionally one-way:
llm.generation ← (no agent deps)
agent.generation_maps → imports from SDK packages only when called

Three public helpers, one per SDK:

to_claude_agent_options(options, params, model_id)
Applies reasoning / thinking, forwards max_tokens via extra_args, and strips-and-warns the remaining params.

to_model_settings(params, model_id) -> ModelSettings
Returns an openai-agents ModelSettings.

to_chat_openai_kwargs(params, model_id) -> dict
Returns init / bind kwargs for langchain_openai.ChatOpenAI.
"""

from __future__ import annotations

import logging

from llm.generation import (
GenerationParams,
EFFORT_TO_OPENAI,
reasoning_supported,
)

_log = logging.getLogger(__name__)


# ── Claude Agent SDK ──────────────────────────────────────────────────────────


def to_claude_agent_options(
    options,  # claude_agent_sdk.ClaudeAgentOptions (avoid hard import at module level)
    params: GenerationParams,
    model_id: str,
) -> None:
    """Mutate *options* in-place with generation params.

    Reasoning is only configured when ``reasoning_supported(model_id)`` is
    true, using the first matching rule:

    1. ``reasoning_effort == "none"`` → thinking disabled, ``effort`` cleared
       (this wins even if ``thinking_budget_tokens`` is set).
    2. ``thinking_budget_tokens`` set → budget-style thinking enabled,
       ``effort`` cleared.
    3. otherwise → adaptive thinking with ``options.effort`` set to the
       requested effort.

    When the model does not support reasoning, ``options.thinking`` and
    ``options.effort`` are left untouched; a warning is logged if an effort
    other than ``"none"`` was requested.

    Of the remaining params only ``max_tokens`` is forwarded: when it differs
    from the ``GenerationParams`` class default it is written to
    ``options.extra_args["max-tokens"]`` (with a warning that support depends
    on the installed CLI). ``temperature``, ``top_p`` and ``stop`` are *not*
    forwarded — each is dropped with a warning when set to a non-default
    value.

    Args:
        options: The ``ClaudeAgentOptions`` instance to update. Its
            ``extra_args`` mapping is copied and reassigned, never mutated
            in place.
        params: Generation parameters to apply.
        model_id: Model identifier passed to ``reasoning_supported``.
    """
    effort = params.reasoning_effort

    if reasoning_supported(model_id):
        if effort == "none":
            from claude_agent_sdk.types import ThinkingConfigDisabled

            options.thinking = ThinkingConfigDisabled(type="disabled")
            options.effort = None
        elif params.thinking_budget_tokens is not None:
            from claude_agent_sdk.types import ThinkingConfigEnabled

            # An explicit budget takes precedence over the effort level.
            options.thinking = ThinkingConfigEnabled(
                type="enabled",
                budget_tokens=params.thinking_budget_tokens,
            )
            options.effort = None
        else:
            from claude_agent_sdk.types import ThinkingConfigAdaptive

            options.thinking = ThinkingConfigAdaptive(type="adaptive")
            options.effort = effort  # type: ignore[assignment]
    elif effort != "none":
        _log.warning(
            "reasoning_effort=%r requested but model %r does not support "
            "reasoning on claude-agent — stripping thinking kwargs.",
            effort,
            model_id,
        )

    # Work on a copy so a mapping shared with the caller is never mutated.
    extra: dict[str, str | None] = dict(options.extra_args or {})

    # NOTE(review): the default is read from the GenerationParams class
    # attribute — this presumes a plain class/dataclass default; confirm it
    # still holds if GenerationParams ever uses slots or a field factory.
    if params.max_tokens != GenerationParams.max_tokens:
        _log.warning(
            "claude-agent: max_tokens=%d forwarded via extra_args; "
            "support depends on the installed Claude Code CLI version.",
            params.max_tokens,
        )
        # NOTE(review): confirm the installed CLI accepts this flag name;
        # behaviour for an unrecognised flag is not visible from here.
        extra["max-tokens"] = str(params.max_tokens)

    if params.temperature != GenerationParams.temperature:
        _log.warning(
            "claude-agent: temperature=%.3g — ClaudeAgentOptions has no "
            "native temperature field; stripping.",
            params.temperature,
        )

    if params.top_p is not None:
        _log.warning(
            "claude-agent: top_p=%.3g — ClaudeAgentOptions has no native "
            "top_p field; stripping.",
            params.top_p,
        )

    if params.stop:
        _log.warning(
            "claude-agent: stop sequences — ClaudeAgentOptions has no native "
            "stop field; stripping.",
        )

    options.extra_args = extra


# ── OpenAI Agents SDK ─────────────────────────────────────────────────────────


def to_model_settings(params: GenerationParams, model_id: str):
    """Build an ``agents.ModelSettings`` from *params*.

    ``max_tokens`` and ``temperature`` are always set. ``top_p`` is included
    only when it is not ``None``; stop sequences, when present, travel in
    ``extra_args`` under the ``"stop"`` key. A ``reasoning_effort`` other than
    ``"none"`` is translated through ``EFFORT_TO_OPENAI`` into
    ``ModelSettings.reasoning`` when ``reasoning_supported(model_id)`` is
    true; otherwise it is dropped and a warning is logged.

    Args:
        params: Generation parameters to translate.
        model_id: Model identifier passed to ``reasoning_supported``.

    Returns:
        A populated ``agents.ModelSettings`` instance.
    """
    # SDK imports stay local so importing this module does not require them.
    from agents import ModelSettings
    from openai.types.shared import Reasoning

    settings: dict = {
        "max_tokens": params.max_tokens,
        "temperature": params.temperature,
    }

    nucleus = params.top_p
    if nucleus is not None:
        settings["top_p"] = nucleus

    stop_sequences = params.stop
    if stop_sequences:
        settings["extra_args"] = {"stop": list(stop_sequences)}

    requested = params.reasoning_effort
    wants_reasoning = requested != "none"
    if wants_reasoning and reasoning_supported(model_id):
        settings["reasoning"] = Reasoning(
            effort=EFFORT_TO_OPENAI[requested],  # type: ignore[arg-type]
        )
    elif wants_reasoning:
        _log.warning(
            "reasoning_effort=%r requested but model %r does not support "
            "reasoning on openai-agent — stripping.",
            requested,
            model_id,
        )

    return ModelSettings(**settings)


# ── LangChain ChatOpenAI (deep-agent) ─────────────────────────────────────────


def to_chat_openai_kwargs(params: GenerationParams, model_id: str) -> dict:
    """Return init / ``.bind()`` kwargs for ``langchain_openai.ChatOpenAI``.

    The result is a flat mapping of top-level keyword arguments (it is *not*
    nested under ``model_kwargs``), so callers can splat it directly into the
    chat-model constructor.

    ``max_tokens`` and ``temperature`` are always present. ``top_p`` is added
    only when it is not ``None`` and ``stop`` only when stop sequences are
    set (copied into a new list). A ``reasoning_effort`` other than ``"none"``
    is translated through ``EFFORT_TO_OPENAI`` and emitted as
    ``reasoning_effort`` when ``reasoning_supported(model_id)`` is true;
    otherwise it is dropped and a warning is logged.

    Args:
        params: Generation parameters to translate.
        model_id: Model identifier passed to ``reasoning_supported``.

    Returns:
        A new dict of keyword arguments.
    """
    kwargs: dict = {
        "max_tokens": params.max_tokens,
        "temperature": params.temperature,
    }

    if params.top_p is not None:
        kwargs["top_p"] = params.top_p

    if params.stop:
        # Copy so the returned kwargs never alias the params' own sequence.
        kwargs["stop"] = list(params.stop)

    effort = params.reasoning_effort
    if effort == "none":
        return kwargs

    if reasoning_supported(model_id):
        kwargs["reasoning_effort"] = EFFORT_TO_OPENAI[effort]
    else:
        _log.warning(
            "reasoning_effort=%r requested but model %r does not support "
            "reasoning on deep-agent — stripping.",
            effort,
            model_id,
        )

    return kwargs
7 changes: 6 additions & 1 deletion src/agent/openai_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@

from observability import agent_run_span, persist_trajectory

from llm.generation import GenerationParams
from .._litellm import LITELLM_PREFIX, resolve_model
from .._prompts import AGENT_SYSTEM_PROMPT
from ..generation_maps import to_model_settings
from ..models import AgentResult, ToolCall, Trajectory, TurnRecord
from ..runner import AgentRunner

Expand Down Expand Up @@ -193,8 +195,10 @@ def __init__(
server_paths: dict[str, Path | str] | None = None,
model: str = _DEFAULT_MODEL,
max_turns: int = 30,
*,
generation: GenerationParams | None = None,
) -> None:
super().__init__(llm, server_paths)
super().__init__(llm, server_paths, generation=generation)
self._model_id = model
self._model = resolve_model(model)
self._run_config = _build_run_config(model)
Expand Down Expand Up @@ -227,6 +231,7 @@ async def run(self, question: str) -> AgentResult:
instructions=AGENT_SYSTEM_PROMPT,
mcp_servers=active_servers,
model=self._model,
model_settings=to_model_settings(self._generation, self._model_id),
)

_log.info(
Expand Down
24 changes: 19 additions & 5 deletions src/agent/plan_execute/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from pathlib import Path

from llm import LLMBackend, LLMResult
from llm.generation import GenerationParams
from observability import agent_run_span, persist_trajectory

from .executor import Executor
Expand Down Expand Up @@ -44,16 +45,26 @@ def reset(self) -> None:
self.input_tokens = 0
self.output_tokens = 0

def generate(self, prompt: str, temperature: float = 0.0) -> str:
result = self._inner.generate_with_usage(prompt, temperature)
def generate(
self,
prompt: str,
temperature: float = 0.0,
*,
params: GenerationParams | None = None,
) -> str:
result = self._inner.generate_with_usage(prompt, temperature, params=params)
self.input_tokens += result.input_tokens
self.output_tokens += result.output_tokens
return result.text

def generate_with_usage(
self, prompt: str, temperature: float = 0.0
self,
prompt: str,
temperature: float = 0.0,
*,
params: GenerationParams | None = None,
) -> LLMResult:
result = self._inner.generate_with_usage(prompt, temperature)
result = self._inner.generate_with_usage(prompt, temperature, params=params)
self.input_tokens += result.input_tokens
self.output_tokens += result.output_tokens
return result
Expand All @@ -62,6 +73,7 @@ def generate_with_usage(
def model_id(self) -> str:
return self._inner.model_id


_log = logging.getLogger(__name__)

_SUMMARIZE_PROMPT = """\
Expand Down Expand Up @@ -102,8 +114,10 @@ def __init__(
self,
llm: LLMBackend,
server_paths: dict[str, Path | str] | None = None,
*,
generation: GenerationParams | None = None,
) -> None:
super().__init__(llm, server_paths)
super().__init__(llm, server_paths, generation=generation)
self._meter = _TokenMeter(llm)
self._planner = Planner(self._meter)
self._executor = Executor(self._meter, server_paths)
Expand Down
Loading