Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions agentflow/core/graph/agent_internal/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,6 @@ def _build_google_config(
structured_output = getattr(self, "output_schema", None) is not None
text_like_output = self.output_type in ("text", "json")

if system_instruction:
config_kwargs["system_instruction"] = system_instruction

if "temperature" in call_kwargs:
config_kwargs["temperature"] = call_kwargs.pop("temperature")
if "max_tokens" in call_kwargs or "max_output_tokens" in call_kwargs:
Expand All @@ -315,6 +312,13 @@ def _build_google_config(
call_kwargs.pop("max_output_tokens", None),
)

cached_content = call_kwargs.pop("cached_content", None)
if cached_content:
# system_instruction is already inside the cache — don't resend it
config_kwargs["cached_content"] = cached_content
elif system_instruction:
config_kwargs["system_instruction"] = system_instruction

if tools and text_like_output and not structured_output:
function_declarations = self._convert_tools_to_google_format(tools)
if function_declarations:
Expand Down Expand Up @@ -370,11 +374,17 @@ async def _call_google_content_generation(
mode_suffix,
self.model,
)
return await self.client.aio.models.generate_content(
response = await self.client.aio.models.generate_content(
model=self.model,
contents=google_contents,
config=config,
)
cached = (
getattr(getattr(response, "usage_metadata", None), "cached_content_token_count", 0) or 0
)
if cached:
logger.debug("Cache hit: %d cached tokens (Google)", cached)
return response

async def _call_google(
self,
Expand All @@ -397,9 +407,24 @@ async def _call_google(

call_kwargs = {**self.llm_kwargs, **kwargs}

# Peek before _build_google_config pops it, so we know whether a cache is active.
has_explicit_cache = bool(call_kwargs.get("cached_content"))

system_instruction, google_contents = self._convert_to_google_format(messages)
config = self._build_google_config(system_instruction, tools, call_kwargs)

# When an explicit cache is active, system_instruction is excluded from the
# config (the static part lives inside the cache). Any dynamic additions
# — memory injections, skill prompts, per-request state — are preserved by
# prepending them as a leading user message so the model still sees them.
if has_explicit_cache and system_instruction:
from google.genai import types

google_contents = [
types.Content(role="user", parts=[types.Part(text=system_instruction)]),
*google_contents,
]

if structured_output:
if config is None:
from google.genai import types
Expand Down
25 changes: 21 additions & 4 deletions agentflow/core/graph/agent_internal/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,25 +101,36 @@ async def _call_openai(
call_kwargs["tools"] = tools

logger.debug("Calling OpenAI beta.chat.completions.parse with model=%s", self.model)
return await self.client.beta.chat.completions.parse(
response = await self.client.beta.chat.completions.parse(
model=self.model,
messages=messages,
response_format=output_schema,
stream=False,
**call_kwargs,
)
details = getattr(getattr(response, "usage", None), "prompt_tokens_details", None)
cached = getattr(details, "cached_tokens", 0) or 0
if cached:
logger.debug("Cache hit: %d cached tokens (OpenAI chat completions)", cached)
return response

if self.output_type in ("text", "json"):
if tools:
call_kwargs["tools"] = tools

logger.debug("Calling OpenAI chat.completions.create with model=%s", self.model)
return await self.client.chat.completions.create(
response = await self.client.chat.completions.create(
model=self.model,
messages=messages,
stream=stream,
**call_kwargs,
)
if not stream:
details = getattr(getattr(response, "usage", None), "prompt_tokens_details", None)
cached = getattr(details, "cached_tokens", 0) or 0
if cached:
logger.debug("Cache hit: %d cached tokens (OpenAI chat completions)", cached)
return response

if self.output_type == "image":
prompt = self._extract_prompt(messages)
Expand Down Expand Up @@ -223,15 +234,21 @@ async def _call_openai_responses( # noqa: PLR0912
call_kwargs["instructions"] = instructions
if responses_tools:
call_kwargs["tools"] = responses_tools
if self.reasoning_config:
if self.reasoning_config: # type: ignore
call_kwargs["reasoning"] = self.reasoning_config

call_kwargs.pop("reasoning_effort", None)

logger.debug("Calling OpenAI responses.create with model=%s", self.model)
return await self.client.responses.create(
response = await self.client.responses.create(
model=self.model,
input=input_items,
stream=stream,
**call_kwargs,
)
if not stream:
details = getattr(getattr(response, "usage", None), "input_tokens_details", None)
cached = getattr(details, "cached_tokens", 0) or 0
if cached:
logger.debug("Cache hit: %d cached tokens (OpenAI responses API)", cached)
return response
3 changes: 2 additions & 1 deletion agentflow/core/llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""LLM client creation utilities shared across agents and evaluators."""

from .caller import call_llm
from .client_factory import create_llm_client, detect_provider


__all__ = ["create_llm_client", "detect_provider"]
__all__ = ["call_llm", "create_llm_client", "detect_provider"]
Loading
Loading