10xHub · Iamsdt · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/agentflow/core/graph/agent_internal/google.py b/agentflow/core/graph/agent_internal/google.py
@@ -304,9 +304,6 @@ def _build_google_config(
         structured_output = getattr(self, "output_schema", None) is not None
         text_like_output = self.output_type in ("text", "json")
 
-        if system_instruction:
-            config_kwargs["system_instruction"] = system_instruction
-
         if "temperature" in call_kwargs:
             config_kwargs["temperature"] = call_kwargs.pop("temperature")
         if "max_tokens" in call_kwargs or "max_output_tokens" in call_kwargs:
@@ -315,6 +312,13 @@ def _build_google_config(
                 call_kwargs.pop("max_output_tokens", None),
             )
 
+        cached_content = call_kwargs.pop("cached_content", None)
+        if cached_content:
+            # system_instruction is already inside the cache — don't resend it
+            config_kwargs["cached_content"] = cached_content
+        elif system_instruction:
+            config_kwargs["system_instruction"] = system_instruction
+
         if tools and text_like_output and not structured_output:
             function_declarations = self._convert_tools_to_google_format(tools)
             if function_declarations:
@@ -370,11 +374,17 @@ async def _call_google_content_generation(
             mode_suffix,
             self.model,
         )
-        return await self.client.aio.models.generate_content(
+        response = await self.client.aio.models.generate_content(
             model=self.model,
             contents=google_contents,
             config=config,
         )
+        cached = (
+            getattr(getattr(response, "usage_metadata", None), "cached_content_token_count", 0) or 0
+        )
+        if cached:
+            logger.debug("Cache hit: %d cached tokens (Google)", cached)
+        return response
 
     async def _call_google(
         self,
@@ -397,9 +407,24 @@ async def _call_google(
 
         call_kwargs = {**self.llm_kwargs, **kwargs}
 
+        # Peek before _build_google_config pops it, so we know whether a cache is active.
+        has_explicit_cache = bool(call_kwargs.get("cached_content"))
+
         system_instruction, google_contents = self._convert_to_google_format(messages)
         config = self._build_google_config(system_instruction, tools, call_kwargs)
 
+        # When an explicit cache is active, system_instruction is excluded from the
+        # config (the static part lives inside the cache).  Any dynamic additions
+        # — memory injections, skill prompts, per-request state — are preserved by
+        # prepending them as a leading user message so the model still sees them.
+        if has_explicit_cache and system_instruction:
+            from google.genai import types
+
+            google_contents = [
+                types.Content(role="user", parts=[types.Part(text=system_instruction)]),
+                *google_contents,
+            ]
+
         if structured_output:
             if config is None:
                 from google.genai import types

diff --git a/agentflow/core/graph/agent_internal/openai.py b/agentflow/core/graph/agent_internal/openai.py
@@ -101,25 +101,36 @@ async def _call_openai(
                 call_kwargs["tools"] = tools
 
             logger.debug("Calling OpenAI beta.chat.completions.parse with model=%s", self.model)
-            return await self.client.beta.chat.completions.parse(
+            response = await self.client.beta.chat.completions.parse(
                 model=self.model,
                 messages=messages,
                 response_format=output_schema,
                 stream=False,
                 **call_kwargs,
             )
+            details = getattr(getattr(response, "usage", None), "prompt_tokens_details", None)
+            cached = getattr(details, "cached_tokens", 0) or 0
+            if cached:
+                logger.debug("Cache hit: %d cached tokens (OpenAI chat completions)", cached)
+            return response
 
         if self.output_type in ("text", "json"):
             if tools:
                 call_kwargs["tools"] = tools
 
             logger.debug("Calling OpenAI chat.completions.create with model=%s", self.model)
-            return await self.client.chat.completions.create(
+            response = await self.client.chat.completions.create(
                 model=self.model,
                 messages=messages,
                 stream=stream,
                 **call_kwargs,
             )
+            if not stream:
+                details = getattr(getattr(response, "usage", None), "prompt_tokens_details", None)
+                cached = getattr(details, "cached_tokens", 0) or 0
+                if cached:
+                    logger.debug("Cache hit: %d cached tokens (OpenAI chat completions)", cached)
+            return response
 
         if self.output_type == "image":
             prompt = self._extract_prompt(messages)
@@ -223,15 +234,21 @@ async def _call_openai_responses(  # noqa: PLR0912
             call_kwargs["instructions"] = instructions
         if responses_tools:
             call_kwargs["tools"] = responses_tools
-        if self.reasoning_config:
+        if self.reasoning_config:  # type: ignore
             call_kwargs["reasoning"] = self.reasoning_config
 
         call_kwargs.pop("reasoning_effort", None)
 
         logger.debug("Calling OpenAI responses.create with model=%s", self.model)
-        return await self.client.responses.create(
+        response = await self.client.responses.create(
             model=self.model,
             input=input_items,
             stream=stream,
             **call_kwargs,
         )
+        if not stream:
+            details = getattr(getattr(response, "usage", None), "input_tokens_details", None)
+            cached = getattr(details, "cached_tokens", 0) or 0
+            if cached:
+                logger.debug("Cache hit: %d cached tokens (OpenAI responses API)", cached)
+        return response
diff --git a/agentflow/core/llm/__init__.py b/agentflow/core/llm/__init__.py
@@ -1,6 +1,7 @@
 """LLM client creation utilities shared across agents and evaluators."""
 
+from .caller import call_llm
 from .client_factory import create_llm_client, detect_provider
 
 
-__all__ = ["create_llm_client", "detect_provider"]
+__all__ = ["call_llm", "create_llm_client", "detect_provider"]