From e5ac134d8411d3c51f20ea548ff49282a91d3442 Mon Sep 17 00:00:00 2001 From: Katrina Date: Wed, 25 Mar 2026 20:28:00 -0400 Subject: [PATCH 01/25] add support for openai realtime model. Add vad tracking for realtime models (using external vad) --- .env.example | 12 +-- src/eva/assistant/pipeline/observers.py | 3 +- src/eva/assistant/pipeline/realtime_llm.py | 94 ++++++++++++++++- src/eva/assistant/pipeline/services.py | 116 ++++++++++++++------- src/eva/assistant/server.py | 31 +++++- src/eva/models/config.py | 13 ++- src/eva/utils/prompt_manager.py | 2 +- 7 files changed, 212 insertions(+), 59 deletions(-) diff --git a/.env.example b/.env.example index 061dd906..7398c5d0 100644 --- a/.env.example +++ b/.env.example @@ -167,20 +167,16 @@ EVA_MODEL__LLM=gpt-5.2 # GOOGLE_API_KEY=your_google_api_key_here # ============================================== -# Optional: Realtime / Audio-LLM Configuration +# Optional: Speech-to-Speech / Audio-LLM Configuration # ============================================== -# Only needed if benchmarking speech-to-speech or realtime models. +# Only needed if benchmarking speech-to-speech models. 
-# EVA_MODEL__REALTIME_MODEL=gpt-realtime-mini -# EVA_MODEL__REALTIME_MODEL_PARAMS='{"voice":"marin"}' +# EVA_MODEL__S2S=openai +# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "voice": "marin"}' # EVA_MODEL__AUDIO_LLM= # EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}' -# Azure Realtime credentials (if using Azure realtime models) -# AZURE_OPENAI_REALTIME_API_KEY= -# AZURE_OPENAI_REALTIME_ENDPOINT= - # ============================================== # Optional: Execution Settings # ============================================== diff --git a/src/eva/assistant/pipeline/observers.py b/src/eva/assistant/pipeline/observers.py index df1a50d5..a3755d48 100644 --- a/src/eva/assistant/pipeline/observers.py +++ b/src/eva/assistant/pipeline/observers.py @@ -22,6 +22,7 @@ from pipecat.observers.turn_tracking_observer import TurnTrackingObserver from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService from pipecat.services.llm_service import LLMService +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from pipecat.services.stt_service import STTService from pipecat.services.tts_service import TTSService @@ -31,7 +32,7 @@ logger = get_logger(__name__) -_TRANSCRIPTION_SERVICES = (STTService, AzureRealtimeLLMService) +_TRANSCRIPTION_SERVICES = (STTService, AzureRealtimeLLMService, OpenAIRealtimeLLMService) class WallClock(SystemClock): diff --git a/src/eva/assistant/pipeline/realtime_llm.py b/src/eva/assistant/pipeline/realtime_llm.py index 7d30bac2..b502b4df 100644 --- a/src/eva/assistant/pipeline/realtime_llm.py +++ b/src/eva/assistant/pipeline/realtime_llm.py @@ -1,6 +1,6 @@ """Instrumented realtime LLM service for correct audit log ordering and timestamps. 
-Subclasses AzureRealtimeLLMService to intercept raw OpenAI Realtime API events +Subclasses OpenAIRealtimeLLMService to intercept raw OpenAI Realtime API events (speech_started, speech_stopped, transcription.completed, response.done) which have a guaranteed ordering and carry item_id for correlation. @@ -11,17 +11,24 @@ Writing user entries on #3 and assistant entries on #5 guarantees correct order. """ +import struct import time from dataclasses import dataclass from typing import Any, Optional -from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService +from pipecat.frames.frames import Frame, InputAudioRawFrame, VADUserStartedSpeakingFrame, VADUserStoppedSpeakingFrame +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from eva.assistant.agentic.audit_log import AuditLog from eva.utils.logging import get_logger logger = get_logger(__name__) +# Audio threshold for detecting speech vs silence +# RMS values below this are considered silence +SILENCE_RMS_THRESHOLD = 10 + @dataclass class _UserTurnRecord: @@ -39,8 +46,20 @@ def _wall_ms() -> str: return str(int(round(time.time() * 1000))) -class InstrumentedRealtimeLLMService(AzureRealtimeLLMService): - """AzureRealtimeLLMService subclass that writes audit log entries with correct ordering and wall-clock timestamps derived from Realtime API events. 
+def _calculate_rms(audio_bytes: bytes) -> float: + """Calculate RMS (root mean square) energy of 16-bit PCM audio.""" + if len(audio_bytes) < 2: + return 0.0 + num_samples = len(audio_bytes) // 2 + samples = struct.unpack(f"<{num_samples}h", audio_bytes[: num_samples * 2]) + if not samples: + return 0.0 + sum_squares = sum(s * s for s in samples) + return (sum_squares / len(samples)) ** 0.5 + + +class InstrumentedRealtimeLLMService(OpenAIRealtimeLLMService): + """OpenAIRealtimeLLMService subclass that writes audit log entries with correct ordering and wall-clock timestamps derived from Realtime API events. All overridden methods call ``super()`` first so that the parent's frame processing (audio playback, interruption handling, metrics, etc.) is fully @@ -61,12 +80,35 @@ def __init__(self, *, audit_log: AuditLog, **kwargs: Any) -> None: # Track whether we're mid-assistant-response (for interruption flushing) self._assistant_responding: bool = False + # Track audio frame timing for VAD delay calculation + self._last_audio_frame_time: Optional[float] = None + self._vad_delay_ms: Optional[int] = None + + async def process_frame(self, frame: Frame, direction: FrameDirection) -> None: + """Track audio frame timing before passing to parent. + + Only updates the timestamp when audio has actual speech content (not silence), + so VAD delay calculation reflects when user actually stopped speaking. + """ + if isinstance(frame, InputAudioRawFrame): + rms = _calculate_rms(frame.audio) + if rms > SILENCE_RMS_THRESHOLD: + self._last_audio_frame_time = time.time() + + await super().process_frame(frame, direction) + async def _handle_evt_speech_started(self, evt: Any) -> None: """Fires when user starts speaking (input_audio_buffer.speech_started). Captures wall-clock start time. Also flushes any in-progress interrupted assistant response before recording the new user turn. 
""" + # Reset VAD tracking for new turn + self._vad_delay_ms = None + + # Broadcast VAD user started speaking frame because realtime VAD does not broadcast it themselves + await self.broadcast_frame(VADUserStartedSpeakingFrame) + # Flush interrupted assistant response if one is in progress if self._assistant_responding and self._current_assistant_transcript_parts: partial_text = "".join(self._current_assistant_transcript_parts) + " [interrupted]" @@ -92,8 +134,21 @@ async def _handle_evt_speech_started(self, evt: Any) -> None: async def _handle_evt_speech_stopped(self, evt: Any) -> None: """Fires when user stops speaking (input_audio_buffer.speech_stopped). - Captures wall-clock end time for the user turn. + Captures wall-clock end time for the user turn and calculates VAD delay. """ + speech_stopped_time = time.time() + + # Calculate VAD delay: time between last audio frame and speech_stopped event + if self._last_audio_frame_time is not None: + self._vad_delay_ms = int((speech_stopped_time - self._last_audio_frame_time) * 1000) + else: + logger.warning("speech_stopped fired but no audio frames were tracked") + self._vad_delay_ms = None + + # Reset audio tracking for next turn + self._last_audio_frame_time = None + + await self.broadcast_frame(VADUserStoppedSpeakingFrame) await super()._handle_evt_speech_stopped(evt) item_id = getattr(evt, "item_id", None) or "" @@ -145,6 +200,7 @@ async def _handle_evt_audio_delta(self, evt: Any) -> None: """Fires for each audio chunk of the assistant response. Captures wall-clock of the *first* delta as assistant response start. + Also logs the full user-perceived response latency including VAD delay. 
""" await super()._handle_evt_audio_delta(evt) @@ -152,6 +208,24 @@ async def _handle_evt_audio_delta(self, evt: Any) -> None: self._assistant_response_start_wall_ms = _wall_ms() self._assistant_responding = True + # Log full user-perceived latency (includes VAD delay) + if self._vad_delay_ms is not None: + # Find the most recent user turn to get speech_stopped time + recent_record = None + for record in self._user_turns.values(): + if record.speech_stopped_wall_ms: + recent_record = record + + if recent_record and recent_record.speech_stopped_wall_ms: + speech_stopped_ms = int(recent_record.speech_stopped_wall_ms) + response_start_ms = int(self._assistant_response_start_wall_ms) + vad_to_response_ms = response_start_ms - speech_stopped_ms + full_latency_ms = vad_to_response_ms + self._vad_delay_ms + logger.debug( + f"Full response latency: {full_latency_ms}ms " + f"(VAD delay: {self._vad_delay_ms}ms + response: {vad_to_response_ms}ms)" + ) + async def _handle_evt_audio_transcript_delta(self, evt: Any) -> None: """Fires for incremental assistant transcript text. @@ -220,6 +294,16 @@ def _reset_assistant_state(self) -> None: self._assistant_response_start_wall_ms = None self._assistant_responding = False + @property + def last_vad_delay_ms(self) -> Optional[int]: + """Return the most recent VAD delay in milliseconds. + + This is the time between when audio frames stopped arriving and when + OpenAI's VAD detected end of speech. Can be used to adjust response + latency measurements to reflect user-perceived latency. 
+ """ + return self._vad_delay_ms + @staticmethod def _response_has_function_calls(evt: Any) -> bool: """Return True if the response.done event contains any function_call outputs.""" diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index da83b77b..e4ce760c 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -20,7 +20,6 @@ AssemblyAIConnectionParams, AssemblyAISTTService, ) -from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService from pipecat.services.cartesia.stt import CartesiaLiveOptions, CartesiaSTTService from pipecat.services.cartesia.tts import CartesiaTTSService from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService @@ -36,12 +35,14 @@ SemanticTurnDetection, SessionProperties, ) +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from pipecat.services.openai.stt import OpenAISTTService from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService from pipecat.services.stt_service import STTService from pipecat.services.tts_service import TTSService from pipecat.transcriptions.language import Language from pipecat.utils.text.base_text_filter import BaseTextFilter +from websockets.asyncio.client import connect as websocket_connect from eva.assistant.pipeline.alm_vllm import ALMvLLMClient from eva.assistant.pipeline.nvidia_baseten import BasetenSTTService, BasetenTTSService @@ -371,6 +372,15 @@ def create_realtime_llm_service( """ model_lower = (model or "").lower() + # Get realtime server prompt + prompt_manager = PromptManager() + system_prompt = prompt_manager.get_prompt( + "realtime_agent.system_prompt", + agent_personality=agent.description, + agent_instructions=agent.instructions, + datetime=current_date_time, + ) + openai_tools = agent.build_tools_for_realtime() if agent else None # Convert OpenAI format tools to pipecat format @@ -390,62 +400,70 @@ def create_realtime_llm_service( ) pipecat_tools = 
ToolsSchema(standard_tools=function_schemas) - # Get realtime server prompt - prompt_manager = PromptManager() - system_prompt = prompt_manager.get_prompt( - "realtime_agent.system_prompt", - agent_personality=agent.description, - agent_instructions=agent.instructions, - datetime=current_date_time, + session_properties = SessionProperties( + instructions=system_prompt, + audio=AudioConfiguration( + input=AudioInput( + transcription=InputAudioTranscription( + model=params.get("transcription_model", "gpt-4o-mini-transcribe") + ), + # Set openai TurnDetection parameters. Not setting this at all will turn it on by default + turn_detection=SemanticTurnDetection(), + ), + output=AudioOutput( + voice=params.get("voice", "marin"), + ), + ), + tools=pipecat_tools, + tool_choice="auto", ) - if model_lower.startswith("gpt-realtime"): + if model_lower.startswith("openai"): + if audit_log is not None: + logger.info( + f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params.get('model')}" + ) + return InstrumentedRealtimeLLMService( + model=params.get("model"), + audit_log=audit_log, + api_key=params.get("api_key") or os.getenv("OPENAI_API_KEY"), + session_properties=session_properties, + ) + + return OpenAIRealtimeLLMService( + api_key=params.get("api_key"), + session_properties=session_properties, + ) + elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): # - # base_url =The full Azure WebSocket endpoint URL including api-version and deployment. + # base_url: The full Azure WebSocket endpoint URL including api-version and deployment. # Example: "wss://my-project.openai.azure.com/openai/v1/realtime" - url = os.environ.get("AZURE_OPENAI_REALTIME_ENDPOINT", "") - url += f"?model={model_lower}" - - session_properties = SessionProperties( - instructions=system_prompt, - audio=AudioConfiguration( - input=AudioInput( - transcription=InputAudioTranscription(model="whisper-1"), - # Set openai TurnDetection parameters. 
Not setting this at all will turn it - # on by default - turn_detection=SemanticTurnDetection(), - # Or set to False to disable openai turn detection and use transport VAD - # turn_detection=False, - # noise_reduction=InputAudioNoiseReduction(type="near_field"), - ), - output=AudioOutput( - voice=params.get("voice", "marin"), - ), - ), - tools=pipecat_tools, - tool_choice="auto", - ) - logger.info(f"Using Azure Realtime LLM: {model_lower}") + url = params.get("url", "") + + logger.info(f"Using Azure Realtime LLM: {model_lower}, url {url}") if audit_log is not None: logger.info("Using InstrumentedRealtimeLLMService for audit log interception") - return InstrumentedRealtimeLLMService( - model=model_lower, + service = InstrumentedRealtimeLLMService( + model=params.get("model"), audit_log=audit_log, - api_key=os.environ.get("AZURE_OPENAI_REALTIME_API_KEY"), + api_key=params.get("api_key"), base_url=url, session_properties=session_properties, ) + service._connect = override__connect.__get__(service) # azure realtime connect (bound per-instance so the class is not patched globally) + return service - return AzureRealtimeLLMService( - api_key=os.environ.get("AZURE_OPENAI_REALTIME_API_KEY"), + return OpenAIRealtimeLLMService( + api_key=params.get("api_key"), base_url=url, session_properties=session_properties, ) elif model_lower == "ultravox": + logger.info("Using Ultravox LLM") return UltravoxRealtimeLLMService( params=OneShotInputParams( - api_key=os.getenv("ULTRAVOX_API_KEY"), + api_key=params.get("api_key"), system_prompt=system_prompt, temperature=0.3, max_duration=datetime.timedelta(minutes=6), @@ -563,6 +581,26 @@ async def override_run_tts(self, text: str, context_id: str) -> AsyncGenerator[F yield ErrorFrame(error=f"Unknown error occurred: {e}") +async def override__connect(self): + try: + if self._websocket: + # Here we assume that if we have a websocket, we are connected. We + # handle disconnections in the send/recv code paths. 
+ return + + logger.info(f"Connecting to {self.base_url}") + self._websocket = await websocket_connect( + uri=self.base_url, + additional_headers={ + "api-key": self.api_key, + }, + ) + self._receive_task = self.create_task(self._receive_task_handler()) + except Exception as e: + await self.push_error(error_msg=f"initialization error: {e}", exception=e) + self._websocket = None + + # Unicode to ASCII replacements for TTS _TTS_CHAR_MAP = str.maketrans( { diff --git a/src/eva/assistant/server.py b/src/eva/assistant/server.py index 57a0fc2e..4282e894 100644 --- a/src/eva/assistant/server.py +++ b/src/eva/assistant/server.py @@ -326,7 +326,10 @@ async def _realtime_tool_handler(params) -> None: "smart_turn_stop_secs", 0.8 ) # Shorter silence so we don't have to wait 3s if smart turn marks audio as incomplete - if isinstance(self.pipeline_config, PipelineConfig) and self.pipeline_config.turn_strategy == "external": + if ( + isinstance(self.pipeline_config, (PipelineConfig, SpeechToSpeechConfig)) + and self.pipeline_config.turn_strategy == "external" + ): logger.info("Using external user turn strategies") user_turn_strategies = ExternalUserTurnStrategies() vad_analyzer = None @@ -444,9 +447,29 @@ async def on_user_transcription(text: str, timestamp: str, turn_id: int | None) self._latency_measurements = [] async def on_latency_measured(observer, latency_seconds: float): - """Event handler for UserBotLatencyObserver - stores latency measurements.""" - self._latency_measurements.append(latency_seconds) - logger.debug(f"Response latency captured: {latency_seconds:.3f}s") + """Event handler for UserBotLatencyObserver - stores latency measurements. + + For realtime LLM, adds VAD delay to get full user-perceived latency. + For pipecat VAD (non-realtime), uses the latency as-is. 
+ """ + adjusted_latency = latency_seconds + + # Add VAD delay for realtime LLM to get full user-perceived latency + if isinstance(realtime_llm, InstrumentedRealtimeLLMService): + vad_delay_ms = realtime_llm.last_vad_delay_ms + if vad_delay_ms is not None: + vad_delay_s = vad_delay_ms / 1000.0 + adjusted_latency = latency_seconds + vad_delay_s + logger.debug( + f"Response latency captured: {adjusted_latency:.3f}s " + f"(VAD delay: {vad_delay_s:.3f}s + pipecat: {latency_seconds:.3f}s)" + ) + else: + logger.debug(f"Response latency captured: {latency_seconds:.3f}s (no VAD delay available)") + else: + logger.debug(f"Response latency captured: {latency_seconds:.3f}s") + + self._latency_measurements.append(adjusted_latency) user_bot_observer = UserBotLatencyObserver() user_bot_observer.add_event_handler("on_latency_measured", on_latency_measured) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index cd8fe819..99b706b6 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -97,6 +97,17 @@ class SpeechToSpeechConfig(BaseModel): s2s: str = Field(description="Speech-to-speech model name", examples=["gpt-realtime-mini", "gemini_live"]) s2s_params: dict[str, Any] = Field({}, description="Additional speech-to-speech model parameters (JSON)") + turn_strategy: Literal["smart", "external"] = Field( + "smart", + description=( + "User turn detection strategy. " + "'smart' uses LocalSmartTurnAnalyzerV3 + SileroVAD (default). " + "'external' uses ExternalUserTurnStrategies for services with built-in turn detection " + "(e.g., deepgram-flux, Speechmatics). " + "Set via EVA_MODEL__TURN_STRATEGY=external." + ), + ) + class AudioLLMConfig(BaseModel): """Configuration for an Audio-LLM pipeline (audio in, text out, separate TTS). 
@@ -129,7 +140,7 @@ class AudioLLMConfig(BaseModel): *PipelineConfig._LEGACY_RENAMES, *PipelineConfig._LEGACY_DROP, } -_S2S_FIELDS = {"s2s", "s2s_params"} +_S2S_FIELDS = {"s2s", "s2s_params", "turn_strategy"} _AUDIO_LLM_FIELDS = {"audio_llm", "audio_llm_params", "tts", "tts_params"} diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py index 2216fddc..56971149 100644 --- a/src/eva/utils/prompt_manager.py +++ b/src/eva/utils/prompt_manager.py @@ -121,7 +121,7 @@ def get_prompt(self, path: str, **variables) -> str: return value.format(**formatted_vars) except KeyError as e: raise KeyError( - "Missing variable {e} for prompt '{path}'. Available variables: {sorted(formatted_vars.keys())}" + f"Missing variable {e} for prompt '{path}'. Available variables: {sorted(formatted_vars.keys())}" ) from e From 227e008903318fa2e365f0172ace3a7790c892c2 Mon Sep 17 00:00:00 2001 From: Katrina Date: Wed, 25 Mar 2026 20:31:34 -0400 Subject: [PATCH 02/25] add api_key to s2s params example --- .env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 7398c5d0..4a434630 100644 --- a/.env.example +++ b/.env.example @@ -172,7 +172,7 @@ EVA_MODEL__LLM=gpt-5.2 # Only needed if benchmarking speech-to-speech models. 
# EVA_MODEL__S2S=openai -# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "voice": "marin"}' +# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "api_key": ""}' # EVA_MODEL__AUDIO_LLM= # EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}' From 59ac784c262cefa0316f8293a124055a6b601cd2 Mon Sep 17 00:00:00 2001 From: Katrina Date: Wed, 25 Mar 2026 20:33:03 -0400 Subject: [PATCH 03/25] add comment --- src/eva/assistant/pipeline/services.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index e4ce760c..9cfb9f5c 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -582,6 +582,7 @@ async def override_run_tts(self, text: str, context_id: str) -> AsyncGenerator[F async def override__connect(self): + # Allow connections to azure / other providers using a base_url try: if self._websocket: # Here we assume that if we have a websocket, we are connected. We From e4f81c4d65abadc1c4fccacec37033938b292342 Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 26 Mar 2026 11:06:23 -0400 Subject: [PATCH 04/25] force riva client 2.25.0 --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ec59536f..75a7d6b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ dependencies = [ "jaconv>=0.3.0", "regex>=2023.0.0", "more-itertools>=10.0.0", - "nvidia-riva-client>=2.25.0,<3.0.0" + "nvidia-riva-client>=2.25.0,<2.25.1" ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index b7c8efec..a291ea54 100644 --- a/uv.lock +++ b/uv.lock @@ -827,7 +827,7 @@ requires-dist = [ { name = "more-itertools", specifier = ">=10.0.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.5" }, { name = "numpy", specifier = ">=1.24" }, - { name = "nvidia-riva-client", specifier = ">=2.25.0,<3.0.0" }, + { name = "nvidia-riva-client", specifier = 
">=2.25.0,<2.25.1" }, { name = "onnxruntime", specifier = ">=1.16.0" }, { name = "openai", specifier = ">=1.0.0" }, { name = "pandas", specifier = ">=2.0" }, From f3a1c15b7c5629db1c53f990cf44ea72a60916b9 Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 26 Mar 2026 11:26:28 -0400 Subject: [PATCH 05/25] move session_properties object to openai/azure only flows --- src/eva/assistant/pipeline/services.py | 41 +++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 9cfb9f5c..939e9236 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -400,25 +400,8 @@ def create_realtime_llm_service( ) pipecat_tools = ToolsSchema(standard_tools=function_schemas) - session_properties = SessionProperties( - instructions=system_prompt, - audio=AudioConfiguration( - input=AudioInput( - transcription=InputAudioTranscription( - model=params.get("transcription_model", "gpt-4o-mini-transcribe") - ), - # Set openai TurnDetection parameters. Not setting this at all will turn it on by default - turn_detection=SemanticTurnDetection(), - ), - output=AudioOutput( - voice=params.get("voice", "marin"), - ), - ), - tools=pipecat_tools, - tool_choice="auto", - ) - if model_lower.startswith("openai"): + session_properties = get_openai_session_properties(system_prompt, params, pipecat_tools) if audit_log is not None: logger.info( f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params.get('model')}" @@ -439,6 +422,7 @@ def create_realtime_llm_service( # base_url: The full Azure WebSocket endpoint URL including api-version and deployment. 
# Example: "wss://my-project.openai.azure.com/openai/v1/realtime" url = params.get("url", "") + session_properties = get_openai_session_properties(system_prompt, params, pipecat_tools) logger.info(f"Using Azure Realtime LLM: {model_lower}, url {url}") @@ -476,6 +460,27 @@ def create_realtime_llm_service( raise ValueError(f"Unknown realtime model: {model}. Available: gpt-realtime, ultravox") +def get_openai_session_properties(system_prompt: str, params: dict, pipecat_tools) -> SessionProperties: + """Create openai compatible session properties object.""" + return SessionProperties( + instructions=system_prompt, + audio=AudioConfiguration( + input=AudioInput( + transcription=InputAudioTranscription( + model=params.get("transcription_model", "gpt-4o-mini-transcribe") + ), + # Set openai TurnDetection parameters. Not setting this at all will turn it on by default + turn_detection=SemanticTurnDetection(), + ), + output=AudioOutput( + voice=params.get("voice", "marin"), + ), + ), + tools=pipecat_tools, + tool_choice="auto", + ) + + def create_audio_llm_client( model: str, params: dict[str, Any], From a13480d0d50d6473797f85855b5cabc70676dd6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 18:02:44 -0400 Subject: [PATCH 06/25] Add model names to timestamp --- src/eva/models/config.py | 27 +++++++++++++++++++++++++ tests/unit/models/test_config_models.py | 4 ++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index cd8fe819..a674465d 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -39,6 +39,26 @@ def current_date_and_time(): return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" +def _model_suffix(model: Any) -> str: + """Build a short suffix from the model config for use in folder names.""" + if isinstance(model, PipelineConfig): + parts = [ + model.stt_params.get("alias") or model.stt_params.get("model") or model.stt or "", + 
model.llm, + model.tts_params.get("alias") or model.tts_params.get("model") or model.tts or "", + ] + elif isinstance(model, SpeechToSpeechConfig): + parts = [model.s2s_params.get("alias") or model.s2s_params.get("model") or model.s2s] + elif isinstance(model, AudioLLMConfig): + parts = [ + model.audio_llm_params.get("alias") or model.audio_llm_params.get("model") or model.audio_llm, + model.tts_params.get("alias") or model.tts_params.get("model") or model.tts or "", + ] + else: + return "" + return "_".join(p for p in parts if p) + + class PipelineConfig(BaseModel): """Configuration for a STT + LLM + TTS pipeline.""" @@ -452,6 +472,13 @@ def _check_companion_services(self) -> "RunConfig": if not self.model.tts: raise ValueError("EVA_MODEL__TTS is required when using EVA_MODEL__AUDIO_LLM (SpeechLM-TTS pipeline).") self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + + # Append model names to auto-generated run_id + if "run_id" not in self.model_fields_set: + suffix = _model_suffix(self.model) + if suffix: + self.run_id = f"{self.run_id}_{suffix}" + return self # Providers that manage their own model/key resolution (e.g. WebSocket-based) diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 9b77854c..47ca4873 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -2,7 +2,6 @@ import json import os -from datetime import datetime from pathlib import Path from unittest.mock import MagicMock, patch @@ -81,7 +80,8 @@ def test_create_minimal_config(self): assert config.dataset_path == Path("data/airline_dataset.jsonl") assert config.tool_mocks_path == Path("data/airline_scenarios") - assert datetime.strptime(config.run_id, "%Y-%m-%d_%H-%M-%S.%f") + # run_id = timestamp + model suffix (e.g. 
"2024-01-15_14-30-45.123456_nova-2_gpt-5.2_sonic") + assert config.run_id.endswith("nova-2_gpt-5.2_sonic") assert config.max_concurrent_conversations == 1 assert config.conversation_timeout_seconds == 360 From 44e521f026b1dab47a92b4e42ffc0de65d048c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 18:04:53 -0400 Subject: [PATCH 07/25] Make sure we are not saving api keys in config.json --- src/eva/models/config.py | 12 ++++++++++++ tests/unit/models/test_config_models.py | 8 +++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index a674465d..d9e46867 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -532,6 +532,18 @@ def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: redacted.append(deployment) return redacted + @field_serializer("model") + @classmethod + def _redact_model_params(cls, model: ModelConfigUnion) -> dict: + """Redact secret values in STT/TTS/S2S/AudioLLM params when serializing.""" + data = model.model_dump(mode="json") + for field_name, value in data.items(): + if field_name.endswith("_params") and isinstance(value, dict): + for key in value: + if "key" in key or "credentials" in key: + value[key] = "***" + return data + @classmethod def from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 47ca4873..3f445544 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -160,13 +160,19 @@ def test_indentation_in_model_list(self, tmp_path: Path, vars_location: str, ind assert config.model_list == MODEL_LIST def test_secrets_redacted(self): - """Secrets are redacted in model_list.""" + """Secrets are redacted in model_list and STT/TTS params.""" config = _config(env_vars=_BASE_ENV) dumped = 
config.model_dump(mode="json") assert dumped["model_list"][0]["litellm_params"]["api_key"] == "***" assert dumped["model_list"][1]["litellm_params"]["vertex_credentials"] == "***" assert dumped["model_list"][2]["litellm_params"]["aws_access_key_id"] == "***" assert dumped["model_list"][2]["litellm_params"]["aws_secret_access_key"] == "***" + # STT/TTS params api_key must also be redacted + assert dumped["model"]["stt_params"]["api_key"] == "***" + assert dumped["model"]["tts_params"]["api_key"] == "***" + # Non-secret fields preserved + assert dumped["model"]["stt_params"]["model"] == "nova-2" + assert dumped["model"]["tts_params"]["model"] == "sonic" @pytest.mark.parametrize( "environ, expected_exception, expected_message", From 1466cec64f470ac6a465c74dec00340f164c62fb Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 26 Mar 2026 18:06:01 -0400 Subject: [PATCH 08/25] make api_key required for realtime models --- .../assistant/pipeline/audio_llm_processor.py | 3 +-- src/eva/assistant/pipeline/services.py | 11 +++++------ src/eva/models/config.py | 17 ++++++++++++----- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/eva/assistant/pipeline/audio_llm_processor.py b/src/eva/assistant/pipeline/audio_llm_processor.py index a9154d4e..bb5b24b3 100644 --- a/src/eva/assistant/pipeline/audio_llm_processor.py +++ b/src/eva/assistant/pipeline/audio_llm_processor.py @@ -19,7 +19,6 @@ import asyncio import base64 import io -import os import time import wave from collections.abc import Awaitable @@ -418,7 +417,7 @@ def __init__( super().__init__(**kwargs) self._audio_collector = audio_collector params = params or {} - self._api_key = params.get("api_key") or os.getenv("OPENAI_API_KEY") + self._api_key = params.get["api_key"] self._model = model self._system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT self._sample_rate = sample_rate diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 939e9236..c750d6b2 
100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -4,7 +4,6 @@ """ import datetime -import os from typing import Any, AsyncGenerator, Optional from deepgram import LiveOptions @@ -409,12 +408,12 @@ def create_realtime_llm_service( return InstrumentedRealtimeLLMService( model=params.get("model"), audit_log=audit_log, - api_key=params.get("api_key") or os.getenv("OPENAI_API_KEY"), + api_key=params["api_key"], session_properties=session_properties, ) return OpenAIRealtimeLLMService( - api_key=params.get("api_key"), + api_key=params["api_key"], session_properties=session_properties, ) elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): @@ -431,7 +430,7 @@ def create_realtime_llm_service( service = InstrumentedRealtimeLLMService( model=params.get("model"), audit_log=audit_log, - api_key=params.get("api_key"), + api_key=params["api_key"], base_url=url, session_properties=session_properties, ) @@ -439,7 +438,7 @@ def create_realtime_llm_service( return service return OpenAIRealtimeLLMService( - api_key=params.get("api_key"), + api_key=params["api_key"], base_url=url, session_properties=session_properties, ) @@ -447,7 +446,7 @@ def create_realtime_llm_service( logger.info("Using Ultravox LLM") return UltravoxRealtimeLLMService( params=OneShotInputParams( - api_key=params.get("api_key"), + api_key=params["api_key"], system_prompt=system_prompt, temperature=0.3, max_duration=datetime.timedelta(minutes=6), diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 99b706b6..6a7ec0e9 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -452,28 +452,35 @@ def _warn_deprecated_aliases(cls, data: Any) -> Any: @model_validator(mode="after") def _check_companion_services(self) -> "RunConfig": """Ensure required companion services are set for each pipeline mode.""" + required_keys = ["api_key", "model"] if isinstance(self.model, PipelineConfig): if not
self.model.stt: raise ValueError("EVA_MODEL__STT is required when using EVA_MODEL__LLM (ASR-LLM-TTS pipeline).") if not self.model.tts: raise ValueError("EVA_MODEL__TTS is required when using EVA_MODEL__LLM (ASR-LLM-TTS pipeline).") - self._validate_service_params("STT", self.model.stt, self.model.stt_params) - self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + self._validate_service_params("STT", self.model.stt, required_keys, self.model.stt_params) + self._validate_service_params("TTS", self.model.tts, required_keys, self.model.tts_params) elif isinstance(self.model, AudioLLMConfig): if not self.model.tts: raise ValueError("EVA_MODEL__TTS is required when using EVA_MODEL__AUDIO_LLM (SpeechLM-TTS pipeline).") - self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + self._validate_service_params("TTS", self.model.tts, required_keys, self.model.tts_params) + self._validate_service_params("audio_llm", self.model.audio_llm, required_keys, self.model.audio_llm_params) + elif isinstance(self.model, SpeechToSpeechConfig): + # api_key is required, some s2s services don't require model + self._validate_service_params("S2S", self.model.s2s, ["api_key"], self.model.s2s_params) return self # Providers that manage their own model/key resolution (e.g. 
WebSocket-based) _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} @classmethod - def _validate_service_params(cls, service: str, provider: str, params: dict[str, Any]) -> None: + def _validate_service_params( + cls, service: str, provider: str, required_keys: list[str], params: dict[str, Any] + ) -> None: """Validate that STT/TTS params contain required keys.""" if provider.lower() in cls._SKIP_PARAMS_VALIDATION: return - missing = [key for key in ("api_key", "model") if key not in params] + missing = [key for key in required_keys if key not in params] if missing: missing_str = " and ".join(f'"{k}"' for k in missing) env_var = f"EVA_MODEL__{service}_PARAMS" From 74a1c018ee55d6f0cc0ad2699ae1f60dfaf001be Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 26 Mar 2026 18:25:04 -0400 Subject: [PATCH 09/25] fix test now that api_key is mandatory for realtime models --- tests/unit/models/test_config_models.py | 37 +++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 9b77854c..81e4179f 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -56,6 +56,10 @@ "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "test_key", "model": "nova-2"}), "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "test_key", "model": "sonic"}), } +_S2S_ENV = _EVA_MODEL_LIST_ENV | { + "EVA_MODEL__S2S": "gpt-realtime-mini", + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": ""}), +} def _config( @@ -355,14 +359,14 @@ class TestDeprecatedEnvVars: lambda c: c.model.tts, ), ( - _EVA_MODEL_LIST_ENV, + _S2S_ENV, "REALTIME_MODEL", "EVA_MODEL__S2S", "test-model", lambda c: c.model.s2s, ), ( - _EVA_MODEL_LIST_ENV, + _S2S_ENV, "EVA_MODEL__REALTIME_MODEL", "EVA_MODEL__S2S", "test-model", @@ -383,17 +387,17 @@ class TestDeprecatedEnvVars: lambda c: c.model.tts_params, ), ( - _EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "test-model"}, + _S2S_ENV, 
"REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"foo": "bar"}, + {"api_key": "k"}, lambda c: c.model.s2s_params, ), ( - _EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "test-model"}, + _S2S_ENV, "EVA_MODEL__REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"foo": "bar"}, + {"api_key": "k"}, lambda c: c.model.s2s_params, ), ( @@ -580,7 +584,7 @@ def test_tts_model(self): assert c.model.tts == "cartesia" def test_realtime_model(self): - config = _config(env_vars=_EVA_MODEL_LIST_ENV, cli_args=["--realtime-model", "test-model"]) + config = _config(env_vars=_S2S_ENV, cli_args=["--realtime-model", "test-model"]) assert config.model.s2s == "test-model" def test_domain_cli(self): @@ -656,20 +660,31 @@ class TestSpeechToSpeechConfig: def test_s2s_config_from_env(self): """EVA_MODEL__S2S selects SpeechToSpeechConfig.""" - config = _config(env_vars=_EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "gpt-realtime-mini"}) + config = _config( + env_vars=_EVA_MODEL_LIST_ENV + | { + "EVA_MODEL__S2S": "gpt-realtime-mini", + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": ""}), + } + ) assert isinstance(config.model, SpeechToSpeechConfig) assert config.model.s2s == "gpt-realtime-mini" def test_s2s_config_from_cli(self): """--s2s-model selects SpeechToSpeechConfig.""" - config = _config(env_vars=_EVA_MODEL_LIST_ENV, cli_args=["--model.s2s", "gemini_live"]) + config = _config( + env_vars=_EVA_MODEL_LIST_ENV, + cli_args=["--model.s2s", "gemini_live", "--model.s2s-params", '{"api_key": "test-key"}'], + ) assert isinstance(config.model, SpeechToSpeechConfig) assert config.model.s2s == "gemini_live" + assert config.model.s2s_params == {"api_key": "test-key"} def test_s2s_config_with_params(self): """S2S params are passed through.""" config = _config( - env_vars=_EVA_MODEL_LIST_ENV, model={"s2s": "gpt-realtime-mini", "s2s_params": {"voice": "alloy"}} + env_vars=_EVA_MODEL_LIST_ENV, + model={"s2s": "gpt-realtime-mini", "s2s_params": {"voice": "alloy", "api_key": "key_1"}}, ) assert 
isinstance(config.model, SpeechToSpeechConfig) - assert config.model.s2s_params == {"voice": "alloy"} + assert config.model.s2s_params == {"voice": "alloy", "api_key": "key_1"} From aee752d5ffafe641dc4425a9c484bb9272d6b6c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 18:25:06 -0400 Subject: [PATCH 10/25] On rerun read the api keys from .env and not config.json --- src/eva/models/config.py | 56 +++++++++++++++++++ src/eva/run_benchmark.py | 3 ++ tests/unit/models/test_config_models.py | 71 +++++++++++++++++++++++++ 3 files changed, 130 insertions(+) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index d9e46867..6e6a4ce7 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -12,6 +12,7 @@ ``RunConfig(_env_file=".env", _cli_parse_args=True)``. """ +import logging from datetime import UTC, datetime from pathlib import Path from typing import Annotated, Any, ClassVar, Literal @@ -34,6 +35,8 @@ from eva.models.provenance import RunProvenance +logger = logging.getLogger(__name__) + def current_date_and_time(): return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" @@ -544,6 +547,59 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: value[key] = "***" return data + def restore_redacted_secrets(self, live: "RunConfig") -> None: + """Replace redacted ``***`` values in ``*_params`` dicts with real values from *live* config. + + Raises: + ValueError: If the saved and live configs use different providers or aliases + for any service that has redacted secrets. + """ + # Map each params field to its provider field (e.g. 
stt_params -> stt) + _PARAMS_TO_PROVIDER = { + "stt_params": "stt", + "tts_params": "tts", + "s2s_params": "s2s", + "audio_llm_params": "audio_llm", + } + for params_field, provider_field in _PARAMS_TO_PROVIDER.items(): + saved = getattr(self.model, params_field, None) + source = getattr(live.model, params_field, None) + if not isinstance(saved, dict) or not isinstance(source, dict): + continue + has_redacted = any(v == "***" for v in saved.values()) + if not has_redacted: + continue + + # Check provider matches (e.g. stt: "deepgram" vs "cartesia") + saved_provider = getattr(self.model, provider_field, None) + live_provider = getattr(live.model, provider_field, None) + if saved_provider != live_provider: + raise ValueError( + f"Cannot restore secrets: saved {provider_field}={saved_provider!r} " + f"but current environment has {provider_field}={live_provider!r}" + ) + + # Check alias matches (strict — aliases identify a specific configuration) + saved_alias = saved.get("alias") + live_alias = source.get("alias") + if saved_alias and live_alias and saved_alias != live_alias: + raise ValueError( + f"Cannot restore secrets: saved {params_field}[alias]={saved_alias!r} " + f"but current environment has {params_field}[alias]={live_alias!r}" + ) + + # Warn if model changed (non-fatal — models can be updated) + saved_model = saved.get("model") + live_model = source.get("model") + if saved_model and live_model and saved_model != live_model: + logger.warning( + f"Model mismatch for {params_field}: saved {saved_model!r}, current environment has {live_model!r}" + ) + + for key, value in saved.items(): + if value == "***" and key in source: + saved[key] = source[key] + @classmethod def from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py index 92d32b01..78a66843 100644 --- a/src/eva/run_benchmark.py +++ b/src/eva/run_benchmark.py @@ -42,6 +42,9 @@ async def run_benchmark(config: 
RunConfig) -> int: logger.error(str(e)) return 1 + # Restore secrets redacted in config.json with live env values + runner.config.restore_redacted_secrets(config) + # Apply CLI overrides runner.config.max_rerun_attempts = config.max_rerun_attempts runner.config.force_rerun_metrics = config.force_rerun_metrics diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 3f445544..69e5fbd2 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -174,6 +174,77 @@ def test_secrets_redacted(self): assert dumped["model"]["stt_params"]["model"] == "nova-2" assert dumped["model"]["tts_params"]["model"] == "sonic" + def test_restore_redacted_secrets(self): + """Redacted secrets are restored from a live config.""" + config = _config(env_vars=_BASE_ENV) + # Simulate round-trip through config.json (redacted on dump, loaded back) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + assert loaded.model.stt_params["api_key"] == "***" + assert loaded.model.tts_params["api_key"] == "***" + + # Restore from live config (which has real keys from env) + loaded.restore_redacted_secrets(config) + assert loaded.model.stt_params["api_key"] == "test_key" + assert loaded.model.tts_params["api_key"] == "test_key" + # Non-secret fields unchanged + assert loaded.model.stt_params["model"] == "nova-2" + + def test_restore_redacted_secrets_provider_mismatch(self): + """Restoring secrets fails if the STT/TTS provider changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Live config uses a different STT provider + live = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT": "openai_whisper", + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "whisper-1"}), + } + ) + with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has 
stt='openai_whisper'"): + loaded.restore_redacted_secrets(live) + + def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Same provider, different model + live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.restore_redacted_secrets(live) + assert "sonic" in caplog.text + assert "sonic-2" in caplog.text + # Secrets still restored despite the warning + assert loaded.model.tts_params["api_key"] == "k" + + def test_restore_redacted_secrets_alias_mismatch(self): + """Restoring secrets fails if the alias changed.""" + config = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "alias": "stt-v1"}), + } + ) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "alias": "stt-v2"}), + } + ) + with pytest.raises( + ValueError, + match=r"saved stt_params\[alias\]='stt-v1'.*current environment has stt_params\[alias\]='stt-v2'", + ): + loaded.restore_redacted_secrets(live) + @pytest.mark.parametrize( "environ, expected_exception, expected_message", ( From e8855ab108303460454c193d354ceb893024791a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 18:55:23 -0400 Subject: [PATCH 11/25] Make sure to not mutate the api keys in memory --- src/eva/models/config.py | 2 ++ tests/unit/models/test_config_models.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 6e6a4ce7..e0dac293 100644 
--- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -12,6 +12,7 @@ ``RunConfig(_env_file=".env", _cli_parse_args=True)``. """ +import copy import logging from datetime import UTC, datetime from pathlib import Path @@ -527,6 +528,7 @@ def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: """Redact secret values in litellm_params when serializing.""" redacted = [] for deployment in deployments: + deployment = copy.deepcopy(deployment) if "litellm_params" in deployment: params = deployment["litellm_params"] for key in params: diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 69e5fbd2..079ce7e6 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -174,6 +174,17 @@ def test_secrets_redacted(self): assert dumped["model"]["stt_params"]["model"] == "nova-2" assert dumped["model"]["tts_params"]["model"] == "sonic" + def test_secrets_redaction_does_not_mutate_live_config(self): + """Serializing must not corrupt the in-memory config objects.""" + config = _config(env_vars=_BASE_ENV) + config.model_dump(mode="json") + # model_list keys must still hold real values + assert config.model_list[0]["litellm_params"]["api_key"] == "must_be_redacted" + assert config.model_list[1]["litellm_params"]["vertex_credentials"] == "must_be_redacted" + # STT/TTS params must still hold real values + assert config.model.stt_params["api_key"] == "test_key" + assert config.model.tts_params["api_key"] == "test_key" + def test_restore_redacted_secrets(self): """Redacted secrets are restored from a live config.""" config = _config(env_vars=_BASE_ENV) From 0f3a04b2c55e301c2217e90938d05a77fc007fd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 19:13:58 -0400 Subject: [PATCH 12/25] Use same strategy for litellm --- src/eva/models/config.py | 41 ++++++++++---- tests/unit/models/test_config_models.py | 72 
++++++++++++++++++------- 2 files changed, 83 insertions(+), 30 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index e0dac293..c7a67f8e 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -550,13 +550,15 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: return data def restore_redacted_secrets(self, live: "RunConfig") -> None: - """Replace redacted ``***`` values in ``*_params`` dicts with real values from *live* config. + """Replace ``***`` values in this config with real values from *live*. + + Covers both ``model.*_params`` (STT/TTS/S2S/AudioLLM secrets) and + ``model_list[].litellm_params`` (LLM deployment secrets). Raises: - ValueError: If the saved and live configs use different providers or aliases - for any service that has redacted secrets. + ValueError: If provider or alias differs for a service with redacted secrets. """ - # Map each params field to its provider field (e.g. stt_params -> stt) + # ── model.*_params (STT / TTS / S2S / AudioLLM) ── _PARAMS_TO_PROVIDER = { "stt_params": "stt", "tts_params": "tts", @@ -568,11 +570,9 @@ def restore_redacted_secrets(self, live: "RunConfig") -> None: source = getattr(live.model, params_field, None) if not isinstance(saved, dict) or not isinstance(source, dict): continue - has_redacted = any(v == "***" for v in saved.values()) - if not has_redacted: + if not any(v == "***" for v in saved.values()): continue - # Check provider matches (e.g. 
stt: "deepgram" vs "cartesia") saved_provider = getattr(self.model, provider_field, None) live_provider = getattr(live.model, provider_field, None) if saved_provider != live_provider: @@ -581,7 +581,6 @@ def restore_redacted_secrets(self, live: "RunConfig") -> None: f"but current environment has {provider_field}={live_provider!r}" ) - # Check alias matches (strict — aliases identify a specific configuration) saved_alias = saved.get("alias") live_alias = source.get("alias") if saved_alias and live_alias and saved_alias != live_alias: @@ -590,18 +589,40 @@ def restore_redacted_secrets(self, live: "RunConfig") -> None: f"but current environment has {params_field}[alias]={live_alias!r}" ) - # Warn if model changed (non-fatal — models can be updated) saved_model = saved.get("model") live_model = source.get("model") if saved_model and live_model and saved_model != live_model: logger.warning( - f"Model mismatch for {params_field}: saved {saved_model!r}, current environment has {live_model!r}" + "Model mismatch for %s: saved %r, current environment has %r", + params_field, + saved_model, + live_model, ) for key, value in saved.items(): if value == "***" and key in source: saved[key] = source[key] + # ── model_list[].litellm_params (LLM deployments) ── + live_by_name = {d["model_name"]: d for d in live.model_list if "model_name" in d} + for deployment in self.model_list: + name = deployment.get("model_name") + if not name: + continue + saved_params = deployment.get("litellm_params", {}) + has_redacted = any(v == "***" for v in saved_params.values()) + if not has_redacted: + continue + if name not in live_by_name: + raise ValueError( + f"Cannot restore secrets: deployment {name!r} not found in " + f"current EVA_MODEL_LIST (available: {list(live_by_name)})" + ) + live_params = live_by_name[name].get("litellm_params", {}) + for key, value in saved_params.items(): + if value == "***" and key in live_params: + saved_params[key] = live_params[key] + @classmethod def 
from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 079ce7e6..7b00573e 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -186,20 +186,29 @@ def test_secrets_redaction_does_not_mutate_live_config(self): assert config.model.tts_params["api_key"] == "test_key" def test_restore_redacted_secrets(self): - """Redacted secrets are restored from a live config.""" + """Redacted secrets are restored from a live config for both model and model_list.""" config = _config(env_vars=_BASE_ENV) - # Simulate round-trip through config.json (redacted on dump, loaded back) dumped_json = config.model_dump_json() loaded = RunConfig.model_validate_json(dumped_json) + + # Everything is redacted after round-trip assert loaded.model.stt_params["api_key"] == "***" assert loaded.model.tts_params["api_key"] == "***" + assert loaded.model_list[0]["litellm_params"]["api_key"] == "***" + assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "***" + assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "***" - # Restore from live config (which has real keys from env) loaded.restore_redacted_secrets(config) + + # STT/TTS params restored assert loaded.model.stt_params["api_key"] == "test_key" assert loaded.model.tts_params["api_key"] == "test_key" - # Non-secret fields unchanged assert loaded.model.stt_params["model"] == "nova-2" + # model_list restored + assert loaded.model_list[0]["litellm_params"]["api_key"] == "must_be_redacted" + assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "must_be_redacted" + assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" + assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" def test_restore_redacted_secrets_provider_mismatch(self): """Restoring 
secrets fails if the STT/TTS provider changed.""" @@ -207,7 +216,6 @@ def test_restore_redacted_secrets_provider_mismatch(self): dumped_json = config.model_dump_json() loaded = RunConfig.model_validate_json(dumped_json) - # Live config uses a different STT provider live = _config( env_vars=_BASE_ENV | { @@ -218,21 +226,6 @@ def test_restore_redacted_secrets_provider_mismatch(self): with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): loaded.restore_redacted_secrets(live) - def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): - """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" - config = _config(env_vars=_BASE_ENV) - dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) - - # Same provider, different model - live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) - with caplog.at_level("WARNING", logger="eva.models.config"): - loaded.restore_redacted_secrets(live) - assert "sonic" in caplog.text - assert "sonic-2" in caplog.text - # Secrets still restored despite the warning - assert loaded.model.tts_params["api_key"] == "k" - def test_restore_redacted_secrets_alias_mismatch(self): """Restoring secrets fails if the alias changed.""" config = _config( @@ -256,6 +249,45 @@ def test_restore_redacted_secrets_alias_mismatch(self): ): loaded.restore_redacted_secrets(live) + def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.restore_redacted_secrets(live) + assert "sonic" 
in caplog.text + assert "sonic-2" in caplog.text + assert loaded.model.tts_params["api_key"] == "k" + + def test_restore_redacted_secrets_llm_deployment_mismatch(self): + """Restoring secrets fails if a saved LLM deployment is missing from the live model_list.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Live config has a different model_list (only one deployment, different name) + different_model_list = [ + { + "model_name": "gpt-4o", + "litellm_params": {"model": "openai/gpt-4o", "api_key": "real_key"}, + } + ] + live = _config( + env_vars={ + "EVA_MODEL_LIST": json.dumps(different_model_list), + "EVA_MODEL__LLM": "gpt-4o", + "EVA_MODEL__STT": "deepgram", + "EVA_MODEL__TTS": "cartesia", + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2"}), + "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic"}), + } + ) + with pytest.raises(ValueError, match=r"deployment 'gpt-5.2' not found in current EVA_MODEL_LIST"): + loaded.restore_redacted_secrets(live) + @pytest.mark.parametrize( "environ, expected_exception, expected_message", ( From ebd0a2691485ccb2d6a41699f72f63ec2ba68f9e Mon Sep 17 00:00:00 2001 From: tara-servicenow Date: Thu, 26 Mar 2026 17:09:05 -0700 Subject: [PATCH 13/25] Fix param get --- src/eva/assistant/pipeline/services.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 328b18aa..1b735824 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -418,12 +418,12 @@ def create_realtime_llm_service( return InstrumentedRealtimeLLMService( model=params.get("model"), audit_log=audit_log, - api_key=params.get["api_key"], + api_key=params["api_key"], session_properties=session_properties, ) return OpenAIRealtimeLLMService( - api_key=params.get["api_key"], + 
api_key=params["api_key"], session_properties=session_properties, ) elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): @@ -440,7 +440,7 @@ def create_realtime_llm_service( service = InstrumentedRealtimeLLMService( model=params.get("model"), audit_log=audit_log, - api_key=params.get["api_key"], + api_key=params["api_key"], base_url=url, session_properties=session_properties, ) @@ -448,7 +448,7 @@ def create_realtime_llm_service( return service return OpenAIRealtimeLLMService( - api_key=params.get["api_key"], + api_key=params["api_key"], base_url=url, session_properties=session_properties, ) @@ -456,7 +456,7 @@ def create_realtime_llm_service( logger.info("Using Ultravox LLM") return UltravoxRealtimeLLMService( params=OneShotInputParams( - api_key=params.get["api_key"], + api_key=params["api_key"], system_prompt=system_prompt, temperature=0.3, max_duration=datetime.timedelta(minutes=6), From 57299567a7074a3d4c57045c3d7df5831e7107d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Mon, 30 Mar 2026 16:58:21 -0400 Subject: [PATCH 14/25] Refactor pipeline name --- src/eva/models/config.py | 49 +++++++++++++++++++--------------- src/eva/orchestrator/runner.py | 4 ++- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index c7a67f8e..7cac90c4 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -43,24 +43,9 @@ def current_date_and_time(): return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" -def _model_suffix(model: Any) -> str: - """Build a short suffix from the model config for use in folder names.""" - if isinstance(model, PipelineConfig): - parts = [ - model.stt_params.get("alias") or model.stt_params.get("model") or model.stt or "", - model.llm, - model.tts_params.get("alias") or model.tts_params.get("model") or model.tts or "", - ] - elif isinstance(model, SpeechToSpeechConfig): - parts = [model.s2s_params.get("alias") 
or model.s2s_params.get("model") or model.s2s] - elif isinstance(model, AudioLLMConfig): - parts = [ - model.audio_llm_params.get("alias") or model.audio_llm_params.get("model") or model.audio_llm, - model.tts_params.get("alias") or model.tts_params.get("model") or model.tts or "", - ] - else: - return "" - return "_".join(p for p in parts if p) +def _param_alias(params: dict[str, Any]) -> str: + """Return the display alias from a params dict.""" + return params.get("alias") or params.get("model") or "" class PipelineConfig(BaseModel): @@ -97,6 +82,16 @@ class PipelineConfig(BaseModel): ), ) + @property + def pipeline_name(self) -> str: + """Short name for use in folder names.""" + parts = [ + _param_alias(self.stt_params) or self.stt or "", + self.llm, + _param_alias(self.tts_params) or self.tts or "", + ] + return "_".join(p for p in parts if p) + @model_validator(mode="before") @classmethod def _migrate_legacy_fields(cls, data: Any) -> Any: @@ -121,6 +116,11 @@ class SpeechToSpeechConfig(BaseModel): s2s: str = Field(description="Speech-to-speech model name", examples=["gpt-realtime-mini", "gemini_live"]) s2s_params: dict[str, Any] = Field({}, description="Additional speech-to-speech model parameters (JSON)") + @property + def pipeline_name(self) -> str: + """Short name for use in folder names.""" + return _param_alias(self.s2s_params) or self.s2s + class AudioLLMConfig(BaseModel): """Configuration for an Audio-LLM pipeline (audio in, text out, separate TTS). 
@@ -142,6 +142,15 @@ class AudioLLMConfig(BaseModel): tts: str | None = Field(None, description="TTS model", examples=["cartesia", "elevenlabs"]) tts_params: dict[str, Any] = Field({}, description="Additional TTS model parameters (JSON)") + @property + def pipeline_name(self) -> str: + """Short name for use in folder names.""" + parts = [ + _param_alias(self.audio_llm_params) or self.audio_llm, + _param_alias(self.tts_params) or self.tts or "", + ] + return "_".join(p for p in parts if p) + _PIPELINE_FIELDS = { "llm", @@ -479,9 +488,7 @@ def _check_companion_services(self) -> "RunConfig": # Append model names to auto-generated run_id if "run_id" not in self.model_fields_set: - suffix = _model_suffix(self.model) - if suffix: - self.run_id = f"{self.run_id}_{suffix}" + self.run_id = f"{self.run_id}_{self.model.pipeline_name}" return self diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index f92d98af..6507dace 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -138,7 +138,9 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: } config_path = self.output_dir / "config.json" - config_path.write_text(self.config.model_dump_json(indent=2)) + config_data = self.config.model_dump(mode="json") + config_data["pipeline_name"] = self.config.model.pipeline_name + config_path.write_text(json.dumps(config_data, indent=2)) # Build output_id list for tracking (supports pass@k) num_trials = self.config.num_trials From a10e2cbe5ee311f1398eddd70736a948dcc81090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Mon, 30 Mar 2026 17:04:08 -0400 Subject: [PATCH 15/25] Saving parts rather than name --- src/eva/models/config.py | 37 +++++++++++++++++----------------- src/eva/orchestrator/runner.py | 3 ++- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 7cac90c4..2ed25490 100644 --- 
a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -83,14 +83,13 @@ class PipelineConfig(BaseModel): ) @property - def pipeline_name(self) -> str: - """Short name for use in folder names.""" - parts = [ - _param_alias(self.stt_params) or self.stt or "", - self.llm, - _param_alias(self.tts_params) or self.tts or "", - ] - return "_".join(p for p in parts if p) + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return { + "stt": _param_alias(self.stt_params) or self.stt or "", + "llm": self.llm, + "tts": _param_alias(self.tts_params) or self.tts or "", + } @model_validator(mode="before") @classmethod @@ -117,9 +116,9 @@ class SpeechToSpeechConfig(BaseModel): s2s_params: dict[str, Any] = Field({}, description="Additional speech-to-speech model parameters (JSON)") @property - def pipeline_name(self) -> str: - """Short name for use in folder names.""" - return _param_alias(self.s2s_params) or self.s2s + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return {"s2s": _param_alias(self.s2s_params) or self.s2s} class AudioLLMConfig(BaseModel): @@ -143,13 +142,12 @@ class AudioLLMConfig(BaseModel): tts_params: dict[str, Any] = Field({}, description="Additional TTS model parameters (JSON)") @property - def pipeline_name(self) -> str: - """Short name for use in folder names.""" - parts = [ - _param_alias(self.audio_llm_params) or self.audio_llm, - _param_alias(self.tts_params) or self.tts or "", - ] - return "_".join(p for p in parts if p) + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return { + "audio_llm": _param_alias(self.audio_llm_params) or self.audio_llm, + "tts": _param_alias(self.tts_params) or self.tts or "", + } _PIPELINE_FIELDS = { @@ -488,7 +486,8 @@ def _check_companion_services(self) -> "RunConfig": # Append model names to auto-generated run_id if "run_id" not in self.model_fields_set: - self.run_id = 
f"{self.run_id}_{self.model.pipeline_name}" + suffix = "_".join(v for v in self.model.pipeline_parts.values() if v) + self.run_id = f"{self.run_id}_{suffix}" return self diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index 6507dace..ac5a45f3 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -139,7 +139,8 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: config_path = self.output_dir / "config.json" config_data = self.config.model_dump(mode="json") - config_data["pipeline_name"] = self.config.model.pipeline_name + pipeline_parts = self.config.model.pipeline_parts + config_data["pipeline_parts"] = pipeline_parts config_path.write_text(json.dumps(config_data, indent=2)) # Build output_id list for tracking (supports pass@k) From 98c6bbf2e63a95d04a86d7856f13dce5cdcc1a1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Tue, 31 Mar 2026 13:50:28 -0400 Subject: [PATCH 16/25] Read url from .env file and rename --- src/eva/models/config.py | 120 +++++++++++++++--------- src/eva/run_benchmark.py | 4 +- tests/unit/models/test_config_models.py | 56 +++++++++-- 3 files changed, 125 insertions(+), 55 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 2ed25490..34ac1ae6 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -300,6 +300,21 @@ class RunConfig(BaseSettings): "EVA_METRICS_TO_RUN": "EVA_METRICS", } + # Providers that manage their own model/key resolution (e.g. 
WebSocket-based) + _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} + + # Maps *_params field names to their provider field for env override logic + _PARAMS_TO_PROVIDER: ClassVar[dict[str, str]] = { + "stt_params": "stt", + "tts_params": "tts", + "s2s_params": "s2s", + "audio_llm_params": "audio_llm", + } + # Keys always read from the live environment (not persisted across runs) + _ENV_OVERRIDE_KEYS: ClassVar[set[str]] = {"url", "urls"} + # Substrings that identify secret keys (redacted in logs and config.json) + _SECRET_KEY_PATTERNS: ClassVar[set[str]] = {"key", "credentials", "secret"} + class ModelDeployment(DeploymentTypedDict): """DeploymentTypedDict that preserves extra keys in litellm_params.""" @@ -491,9 +506,6 @@ def _check_companion_services(self) -> "RunConfig": return self - # Providers that manage their own model/key resolution (e.g. WebSocket-based) - _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} - @classmethod def _validate_service_params(cls, service: str, provider: str, params: dict[str, Any]) -> None: """Validate that STT/TTS params contain required keys.""" @@ -555,58 +567,65 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: value[key] = "***" return data - def restore_redacted_secrets(self, live: "RunConfig") -> None: - """Replace ``***`` values in this config with real values from *live*. + def apply_env_overrides(self, live: "RunConfig") -> None: + """Apply environment-dependent values from *live* config onto this (saved) config. - Covers both ``model.*_params`` (STT/TTS/S2S/AudioLLM secrets) and - ``model_list[].litellm_params`` (LLM deployment secrets). + Restores redacted secrets (``***``) and overrides dynamic fields (``url``, + ``urls``) in ``model.*_params`` and ``model_list[].litellm_params``. Raises: ValueError: If provider or alias differs for a service with redacted secrets. 
""" # ── model.*_params (STT / TTS / S2S / AudioLLM) ── - _PARAMS_TO_PROVIDER = { - "stt_params": "stt", - "tts_params": "tts", - "s2s_params": "s2s", - "audio_llm_params": "audio_llm", - } - for params_field, provider_field in _PARAMS_TO_PROVIDER.items(): + for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): saved = getattr(self.model, params_field, None) source = getattr(live.model, params_field, None) if not isinstance(saved, dict) or not isinstance(source, dict): continue - if not any(v == "***" for v in saved.values()): - continue - - saved_provider = getattr(self.model, provider_field, None) - live_provider = getattr(live.model, provider_field, None) - if saved_provider != live_provider: - raise ValueError( - f"Cannot restore secrets: saved {provider_field}={saved_provider!r} " - f"but current environment has {provider_field}={live_provider!r}" - ) - - saved_alias = saved.get("alias") - live_alias = source.get("alias") - if saved_alias and live_alias and saved_alias != live_alias: - raise ValueError( - f"Cannot restore secrets: saved {params_field}[alias]={saved_alias!r} " - f"but current environment has {params_field}[alias]={live_alias!r}" - ) - saved_model = saved.get("model") - live_model = source.get("model") - if saved_model and live_model and saved_model != live_model: - logger.warning( - "Model mismatch for %s: saved %r, current environment has %r", - params_field, - saved_model, - live_model, - ) + has_redacted = any(v == "***" for v in saved.values()) + has_env_overrides = any(k in saved or k in source for k in self._ENV_OVERRIDE_KEYS) + if not has_redacted and not has_env_overrides: + continue - for key, value in saved.items(): - if value == "***" and key in source: + if has_redacted: + saved_provider = getattr(self.model, provider_field, None) + live_provider = getattr(live.model, provider_field, None) + if saved_provider != live_provider: + raise ValueError( + f"Cannot restore secrets: saved {provider_field}={saved_provider!r} " 
+ f"but current environment has {provider_field}={live_provider!r}" + ) + + saved_alias = saved.get("alias") + live_alias = source.get("alias") + if saved_alias and live_alias and saved_alias != live_alias: + raise ValueError( + f"Cannot restore secrets: saved {params_field}[alias]={saved_alias!r} " + f"but current environment has {params_field}[alias]={live_alias!r}" + ) + + saved_model = saved.get("model") + live_model = source.get("model") + if saved_model and live_model and saved_model != live_model: + logger.warning( + f"Model mismatch for {params_field}: saved {saved_model!r}, " + f"current environment has {live_model!r}" + ) + + for key, value in saved.items(): + if value == "***" and key in source: + saved[key] = source[key] + + # Always use url/urls from the live environment + for key in self._ENV_OVERRIDE_KEYS: + if key in source: + saved_val = saved.get(key) + if saved_val and saved_val != source[key]: + logger.warning( + f"{params_field}[{key}] differs: saved {saved_val!r}, " + f"using {source[key]!r} from current environment" + ) saved[key] = source[key] # ── model_list[].litellm_params (LLM deployments) ── @@ -629,6 +648,21 @@ def restore_redacted_secrets(self, live: "RunConfig") -> None: if value == "***" and key in live_params: saved_params[key] = live_params[key] + # ── Log resolved configuration ── + def _safe_params(p: dict) -> dict: + return {k: "***" if any(s in k for s in self._SECRET_KEY_PATTERNS) else v for k, v in p.items()} + + for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): + params = getattr(self.model, params_field, None) + provider = getattr(self.model, provider_field, None) + if isinstance(params, dict) and params: + logger.info(f"Resolved {provider_field} ({provider}): {_safe_params(params)}") + + for deployment in self.model_list: + name = deployment.get("model_name", "?") + params = deployment.get("litellm_params", {}) + logger.info(f"Resolved deployment {name}: {_safe_params(params)}") + @classmethod def 
from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py index 78a66843..49096448 100644 --- a/src/eva/run_benchmark.py +++ b/src/eva/run_benchmark.py @@ -42,8 +42,8 @@ async def run_benchmark(config: RunConfig) -> int: logger.error(str(e)) return 1 - # Restore secrets redacted in config.json with live env values - runner.config.restore_redacted_secrets(config) + # Apply env-dependent values (secrets, urls) from live env onto saved config + runner.config.apply_env_overrides(config) # Apply CLI overrides runner.config.max_rerun_attempts = config.max_rerun_attempts diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 7b00573e..4a2c22e4 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -185,7 +185,7 @@ def test_secrets_redaction_does_not_mutate_live_config(self): assert config.model.stt_params["api_key"] == "test_key" assert config.model.tts_params["api_key"] == "test_key" - def test_restore_redacted_secrets(self): + def test_apply_env_overrides(self): """Redacted secrets are restored from a live config for both model and model_list.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() @@ -198,7 +198,7 @@ def test_restore_redacted_secrets(self): assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "***" assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "***" - loaded.restore_redacted_secrets(config) + loaded.apply_env_overrides(config) # STT/TTS params restored assert loaded.model.stt_params["api_key"] == "test_key" @@ -210,7 +210,7 @@ def test_restore_redacted_secrets(self): assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" - def 
test_restore_redacted_secrets_provider_mismatch(self): + def test_apply_env_overrides_provider_mismatch(self): """Restoring secrets fails if the STT/TTS provider changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() @@ -224,9 +224,9 @@ def test_restore_redacted_secrets_provider_mismatch(self): } ) with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): - loaded.restore_redacted_secrets(live) + loaded.apply_env_overrides(live) - def test_restore_redacted_secrets_alias_mismatch(self): + def test_apply_env_overrides_alias_mismatch(self): """Restoring secrets fails if the alias changed.""" config = _config( env_vars=_BASE_ENV @@ -247,9 +247,9 @@ def test_restore_redacted_secrets_alias_mismatch(self): ValueError, match=r"saved stt_params\[alias\]='stt-v1'.*current environment has stt_params\[alias\]='stt-v2'", ): - loaded.restore_redacted_secrets(live) + loaded.apply_env_overrides(live) - def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): + def test_apply_env_overrides_model_mismatch_warns(self, caplog): """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() @@ -257,12 +257,48 @@ def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) with caplog.at_level("WARNING", logger="eva.models.config"): - loaded.restore_redacted_secrets(live) + loaded.apply_env_overrides(live) assert "sonic" in caplog.text assert "sonic-2" in caplog.text assert loaded.model.tts_params["api_key"] == "k" - def test_restore_redacted_secrets_llm_deployment_mismatch(self): + def test_apply_env_overrides_url_from_env(self, caplog): + """Url is always taken from the live env, with a warning if it differs.""" + saved_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": 
json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://old-host/stt"}), + } + config = _config(env_vars=saved_env) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Live env has a different url + live_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), + } + live = _config(env_vars=live_env) + + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.apply_env_overrides(live) + + assert loaded.model.stt_params["url"] == "wss://new-host/stt" + assert "wss://old-host/stt" in caplog.text + assert "wss://new-host/stt" in caplog.text + + def test_apply_env_overrides_url_added_from_env(self): + """Url from live env is added even if the saved config didn't have one.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), + } + live = _config(env_vars=live_env) + loaded.apply_env_overrides(live) + + assert loaded.model.stt_params["url"] == "wss://new-host/stt" + + def test_apply_env_overrides_llm_deployment_mismatch(self): """Restoring secrets fails if a saved LLM deployment is missing from the live model_list.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() @@ -286,7 +322,7 @@ def test_restore_redacted_secrets_llm_deployment_mismatch(self): } ) with pytest.raises(ValueError, match=r"deployment 'gpt-5.2' not found in current EVA_MODEL_LIST"): - loaded.restore_redacted_secrets(live) + loaded.apply_env_overrides(live) @pytest.mark.parametrize( "environ, expected_exception, expected_message", From d366872fa694814a7b0f0f822448442753bbb982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Tue, 31 Mar 2026 14:44:56 -0400 Subject: [PATCH 17/25] Address feebdack 
--- src/eva/models/config.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 34ac1ae6..a5022341 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -540,6 +540,16 @@ def _expand_metrics_all(cls, v: list[str] | None) -> list[str] | None: return [m for m in get_global_registry().list_metrics() if m not in cls._VALIDATION_METRIC_NAMES] return v + @classmethod + def _is_secret_key(cls, key: str) -> bool: + """Return True if *key* matches any pattern in _SECRET_KEY_PATTERNS.""" + return any(pattern in key for pattern in cls._SECRET_KEY_PATTERNS) + + @classmethod + def _redact_dict(cls, params: dict) -> dict: + """Return a copy of *params* with secret values replaced by ``***``.""" + return {k: "***" if cls._is_secret_key(k) else v for k, v in params.items()} + @field_serializer("model_list") @classmethod def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: @@ -548,10 +558,7 @@ def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: for deployment in deployments: deployment = copy.deepcopy(deployment) if "litellm_params" in deployment: - params = deployment["litellm_params"] - for key in params: - if "key" in key or "credentials" in key: - params[key] = "***" + deployment["litellm_params"] = cls._redact_dict(deployment["litellm_params"]) redacted.append(deployment) return redacted @@ -562,9 +569,7 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: data = model.model_dump(mode="json") for field_name, value in data.items(): if field_name.endswith("_params") and isinstance(value, dict): - for key in value: - if "key" in key or "credentials" in key: - value[key] = "***" + data[field_name] = cls._redact_dict(value) return data def apply_env_overrides(self, live: "RunConfig") -> None: @@ -649,19 +654,16 @@ def apply_env_overrides(self, live: "RunConfig") -> None: saved_params[key] = 
live_params[key] # ── Log resolved configuration ── - def _safe_params(p: dict) -> dict: - return {k: "***" if any(s in k for s in self._SECRET_KEY_PATTERNS) else v for k, v in p.items()} - for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): params = getattr(self.model, params_field, None) provider = getattr(self.model, provider_field, None) if isinstance(params, dict) and params: - logger.info(f"Resolved {provider_field} ({provider}): {_safe_params(params)}") + logger.info(f"Resolved {provider_field} ({provider}): {self._redact_dict(params)}") for deployment in self.model_list: name = deployment.get("model_name", "?") params = deployment.get("litellm_params", {}) - logger.info(f"Resolved deployment {name}: {_safe_params(params)}") + logger.info(f"Resolved deployment {name}: {self._redact_dict(params)}") @classmethod def from_yaml(cls, path: Path | str) -> "RunConfig": From 86adbc36692bf90f1b396dc31f9bff38253d0694 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Tue, 31 Mar 2026 16:08:11 -0400 Subject: [PATCH 18/25] Explain `run_id` default value in `eva --help` --- src/eva/models/config.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index a5022341..6022829b 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -39,10 +39,6 @@ logger = logging.getLogger(__name__) -def current_date_and_time(): - return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" - - def _param_alias(params: dict[str, Any]) -> str: """Return the display alias from a params dict.""" return params.get("alias") or params.get("model") or "" @@ -329,7 +325,7 @@ class ModelDeployment(DeploymentTypedDict): # Run identifier run_id: str = Field( - default_factory=current_date_and_time, + "timestamp and model name(s)", # Overwritten by _set_default_run_id() description="Run identifier, auto-generated if not provided", ) @@ -498,12 +494,13 @@ def _check_companion_services(self) -> 
"RunConfig": if not self.model.tts: raise ValueError("EVA_MODEL__TTS is required when using EVA_MODEL__AUDIO_LLM (SpeechLM-TTS pipeline).") self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + return self - # Append model names to auto-generated run_id + @model_validator(mode="after") + def _set_default_run_id(self) -> "RunConfig": if "run_id" not in self.model_fields_set: suffix = "_".join(v for v in self.model.pipeline_parts.values() if v) - self.run_id = f"{self.run_id}_{suffix}" - + self.run_id = f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}_{suffix}" return self @classmethod From 96674b9253319f7ba128ebe5b178e0b67d317111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Wed, 1 Apr 2026 17:23:52 -0400 Subject: [PATCH 19/25] Address feebdack --- src/eva/models/config.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 6022829b..c37675c9 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -41,7 +41,7 @@ def _param_alias(params: dict[str, Any]) -> str: """Return the display alias from a params dict.""" - return params.get("alias") or params.get("model") or "" + return params.get("alias") or params["model"] class PipelineConfig(BaseModel): @@ -82,9 +82,9 @@ class PipelineConfig(BaseModel): def pipeline_parts(self) -> dict[str, str]: """Component names for this pipeline.""" return { - "stt": _param_alias(self.stt_params) or self.stt or "", + "stt": _param_alias(self.stt_params) or self.stt, "llm": self.llm, - "tts": _param_alias(self.tts_params) or self.tts or "", + "tts": _param_alias(self.tts_params) or self.tts, } @model_validator(mode="before") @@ -142,7 +142,7 @@ def pipeline_parts(self) -> dict[str, str]: """Component names for this pipeline.""" return { "audio_llm": _param_alias(self.audio_llm_params) or self.audio_llm, - "tts": _param_alias(self.tts_params) or self.tts or "", + 
"tts": _param_alias(self.tts_params) or self.tts, } @@ -591,14 +591,6 @@ def apply_env_overrides(self, live: "RunConfig") -> None: continue if has_redacted: - saved_provider = getattr(self.model, provider_field, None) - live_provider = getattr(live.model, provider_field, None) - if saved_provider != live_provider: - raise ValueError( - f"Cannot restore secrets: saved {provider_field}={saved_provider!r} " - f"but current environment has {provider_field}={live_provider!r}" - ) - saved_alias = saved.get("alias") live_alias = source.get("alias") if saved_alias and live_alias and saved_alias != live_alias: @@ -607,6 +599,14 @@ def apply_env_overrides(self, live: "RunConfig") -> None: f"but current environment has {params_field}[alias]={live_alias!r}" ) + saved_provider = getattr(self.model, provider_field, None) + live_provider = getattr(live.model, provider_field, None) + if saved_provider != live_provider: + logger.warning( + f"Provider mismatch for {params_field}: saved {saved_provider!r}, " + f"current environment has {live_provider!r}" + ) + saved_model = saved.get("model") live_model = source.get("model") if saved_model and live_model and saved_model != live_model: From 5facc71b41bfcbec51f7e27d7b124cc9d35da843 Mon Sep 17 00:00:00 2001 From: JosephMarinier <8386369+JosephMarinier@users.noreply.github.com> Date: Tue, 7 Apr 2026 20:12:20 +0000 Subject: [PATCH 20/25] Apply pre-commit --- src/eva/models/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 6f98a029..b69a2beb 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -121,7 +121,7 @@ class SpeechToSpeechConfig(BaseModel): "Set via EVA_MODEL__TURN_STRATEGY=external." 
), ) - + @property def pipeline_parts(self) -> dict[str, str]: """Component names for this pipeline.""" From a4caf5d5365ae8132312412cf5b46d790ae200fb Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Tue, 7 Apr 2026 19:15:52 -0400 Subject: [PATCH 21/25] Adapt test_apply_env_overrides_provider_mismatch --- tests/unit/models/test_config_models.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 99424e22..110d9ff4 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -214,8 +214,8 @@ def test_apply_env_overrides(self): assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" - def test_apply_env_overrides_provider_mismatch(self): - """Restoring secrets fails if the STT/TTS provider changed.""" + def test_apply_env_overrides_provider_mismatch(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS provider changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() loaded = RunConfig.model_validate_json(dumped_json) @@ -227,8 +227,9 @@ def test_apply_env_overrides_provider_mismatch(self): "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "whisper-1"}), } ) - with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): + with caplog.at_level("WARNING", logger="eva.models.config"): loaded.apply_env_overrides(live) + assert "saved 'deepgram', current environment has 'openai_whisper'" in caplog.text def test_apply_env_overrides_alias_mismatch(self): """Restoring secrets fails if the alias changed.""" From c30bf0796de9fb4d35b8224d6a97b6ec89f5422f Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Tue, 7 Apr 2026 19:16:21 -0400 Subject: [PATCH 22/25] Run tests in all PRs, no matter 
the branch --- .github/workflows/tests.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 2d00e8ff..ad62d66d 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -3,7 +3,6 @@ name: Tests on: merge_group: pull_request: - branches: [main] jobs: test: From cc864d474926d52f87bfcac01fa701f829c097d5 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Wed, 8 Apr 2026 11:32:50 -0400 Subject: [PATCH 23/25] Remove Markdown alerts inside details as they don't work in GitHub. --- README.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 0ba0d575..848c3784 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,7 @@ Agents that score well on task completion tend to score worse on conversational We recommend using [uv](https://docs.astral.sh/uv/) for fast, reliable dependency management. If you don't have `uv` installed, see the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/). -> [!NOTE] -> This project requires **Python 3.11–3.13** (set via `requires-python` in `pyproject.toml`). `uv` will automatically select a compatible version. If you're using pip, make sure you're running a supported Python version. +This project requires **Python 3.11–3.13** (set via `requires-python` in `pyproject.toml`). `uv` will automatically select a compatible version. If you're using pip, make sure you're running a supported Python version. 
```bash # Clone the repository @@ -46,18 +45,16 @@ cp .env.example .env # Edit .env with your API keys (ELEVENLABS_API_KEY, OPENAI_API_KEY required) ``` -> [!TIP] -> After installation, you can run EVA using either: -> - `eva` — CLI entry point (e.g., `eva --help`) -> - `python main.py` — script at the repo root (e.g., `python main.py --help`) -> -> If using an IDE, point your Python interpreter to `.venv/bin/python` so commands run in the virtual environment automatically. Otherwise, prefix commands with `uv run` or activate the environment with `source .venv/bin/activate`. +After installation, you can run EVA using either: +- `eva` — CLI entry point (e.g., `eva --help`) +- `python main.py` — script at the repo root (e.g., `python main.py --help`) + +If using an IDE, point your Python interpreter to `.venv/bin/python` so commands run in the virtual environment automatically. Otherwise, prefix commands with `uv run` or activate the environment with `source .venv/bin/activate`.
Alternative: using pip -> [!NOTE] -> This project requires Python 3.11. If you need to manage multiple Python versions, consider using [pyenv](https://github.com/pyenv/pyenv). +This project requires Python 3.11. If you need to manage multiple Python versions, consider using [pyenv](https://github.com/pyenv/pyenv). ```bash # Create and activate a virtual environment From 3e2cc30cdf9889e51ab7f43202066821c27fb2a3 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Wed, 8 Apr 2026 11:33:54 -0400 Subject: [PATCH 24/25] Document cloning latest tag --- README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 848c3784..2e492373 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,18 @@ Agents that score well on task completion tend to score worse on conversational

Quick Start

+### Cloning the Repository + +If you're only interested in running the latest stable version of EVA, you can clone with `--branch latest`, and optionally speed things up with `--depth 1 --no-tags --single-branch`. +```bash +git clone https://github.com/ServiceNow/eva.git --branch latest --depth 1 --no-tags --single-branch +``` + +Otherwise, for development, you can clone the default branch, `main`. +```bash +git clone https://github.com/ServiceNow/eva.git +``` + ### Installation We recommend using [uv](https://docs.astral.sh/uv/) for fast, reliable dependency management. If you don't have `uv` installed, see the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/). @@ -33,8 +45,6 @@ We recommend using [uv](https://docs.astral.sh/uv/) for fast, reliable dependenc This project requires **Python 3.11–3.13** (set via `requires-python` in `pyproject.toml`). `uv` will automatically select a compatible version. If you're using pip, make sure you're running a supported Python version. 
```bash -# Clone the repository -git clone https://github.com/ServiceNow/eva.git cd eva # Install all dependencies (uv automatically creates a virtual environment) From 6d771db15259996e20ae06bcd69274aca24d7a91 Mon Sep 17 00:00:00 2001 From: Katrina Date: Wed, 8 Apr 2026 12:19:15 -0400 Subject: [PATCH 25/25] make model required for all services --- src/eva/assistant/pipeline/services.py | 22 +++++--- src/eva/models/config.py | 11 ++-- tests/unit/models/test_config_models.py | 67 +++++++++++++------------ 3 files changed, 53 insertions(+), 47 deletions(-) diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 1b735824..c8ee3eff 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -412,19 +412,22 @@ def create_realtime_llm_service( if model_lower.startswith("openai"): session_properties = get_openai_session_properties(system_prompt, params, pipecat_tools) if audit_log is not None: - logger.info( - f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params.get('model')}" - ) + logger.info(f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params['model']}") return InstrumentedRealtimeLLMService( - model=params.get("model"), + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, + ), audit_log=audit_log, api_key=params["api_key"], - session_properties=session_properties, ) return OpenAIRealtimeLLMService( api_key=params["api_key"], - session_properties=session_properties, + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, + ), ) elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): # @@ -438,17 +441,21 @@ def create_realtime_llm_service( if audit_log is not None: logger.info("Using InstrumentedRealtimeLLMService for audit log interception") service = InstrumentedRealtimeLLMService( - 
model=params.get("model"), audit_log=audit_log, api_key=params["api_key"], base_url=url, session_properties=session_properties, + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, + ), ) InstrumentedRealtimeLLMService._connect = override__connect # azure realtime connect return service return OpenAIRealtimeLLMService( api_key=params["api_key"], + model=params["model"], base_url=url, session_properties=session_properties, ) @@ -461,6 +468,7 @@ def create_realtime_llm_service( temperature=0.3, max_duration=datetime.timedelta(minutes=6), voice=params.get("voice", "03e20d03-35e4-43c4-bb18-9b18a2cd3086"), + model=params["model"], ), one_shot_selected_tools=pipecat_tools, ) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index b69a2beb..e08783bd 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -82,9 +82,9 @@ class PipelineConfig(BaseModel): def pipeline_parts(self) -> dict[str, str]: """Component names for this pipeline.""" return { - "stt": _param_alias(self.stt_params) or self.stt, + "stt": _param_alias(self.stt_params), "llm": self.llm, - "tts": _param_alias(self.tts_params) or self.tts, + "tts": _param_alias(self.tts_params), } @model_validator(mode="before") @@ -307,9 +307,6 @@ class RunConfig(BaseSettings): "EVA_METRICS_TO_RUN": "EVA_METRICS", } - # Providers that manage their own model/key resolution (e.g. 
WebSocket-based) - _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} - # Maps *_params field names to their provider field for env override logic _PARAMS_TO_PROVIDER: ClassVar[dict[str, str]] = { "stt_params": "stt", @@ -503,7 +500,7 @@ def _check_companion_services(self) -> "RunConfig": self._validate_service_params("audio_llm", self.model.audio_llm, required_keys, self.model.audio_llm_params) elif isinstance(self.model, SpeechToSpeechConfig): - # api_key is required, some s2s services don't require model - self._validate_service_params("S2S", self.model.s2s, ["api_key"], self.model.s2s_params) + # api_key and model are both required for s2s services, same as the other services + self._validate_service_params("S2S", self.model.s2s, required_keys, self.model.s2s_params) return self @model_validator(mode="after") @@ -518,8 +515,6 @@ def _validate_service_params( cls, service: str, provider: str, required_keys: list[str], params: dict[str, Any] ) -> None: """Validate that STT/TTS params contain required keys.""" - if provider.lower() in cls._SKIP_PARAMS_VALIDATION: - return missing = [key for key in required_keys if key not in params] if missing: missing_str = " and ".join(f'"{k}"' for k in missing) diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 99424e22..50f22c73 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -57,7 +57,7 @@ } _S2S_ENV = _EVA_MODEL_LIST_ENV | { "EVA_MODEL__S2S": "gpt-realtime-mini", - "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": ""}), + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": "", "model": "test"}), } @@ -77,6 +77,12 @@ def _config( return RunConfig(_env_file=env_file, _cli_parse_args=cli_args, **kwargs) +def _load_json_into_runconfig(json_str: str) -> RunConfig: + """Load RunConfig from JSON with isolated environment (no real env vars).""" + with patch.dict(os.environ, {}, clear=True): + return RunConfig.model_validate_json(json_str) + + class TestRunConfig: def test_create_minimal_config(self): """Test
creating a minimal RunConfig.""" @@ -193,7 +199,7 @@ def test_apply_env_overrides(self): """Redacted secrets are restored from a live config for both model and model_list.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) # Everything is redacted after round-trip assert loaded.model.stt_params["api_key"] == "***" @@ -214,11 +220,11 @@ def test_apply_env_overrides(self): assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" - def test_apply_env_overrides_provider_mismatch(self): - """Restoring secrets fails if the STT/TTS provider changed.""" + def test_apply_env_overrides_provider_mismatch(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS provider changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) live = _config( env_vars=_BASE_ENV @@ -227,8 +233,11 @@ def test_apply_env_overrides_provider_mismatch(self): "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "whisper-1"}), } ) - with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): + with caplog.at_level("WARNING", logger="eva.models.config"): loaded.apply_env_overrides(live) + assert "Provider mismatch for stt_params" in caplog.text + assert "deepgram" in caplog.text + assert "openai_whisper" in caplog.text def test_apply_env_overrides_alias_mismatch(self): """Restoring secrets fails if the alias changed.""" @@ -239,7 +248,7 @@ def test_apply_env_overrides_alias_mismatch(self): } ) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) live = 
_config( env_vars=_BASE_ENV @@ -257,7 +266,7 @@ def test_apply_env_overrides_model_mismatch_warns(self, caplog): """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) with caplog.at_level("WARNING", logger="eva.models.config"): @@ -273,7 +282,7 @@ def test_apply_env_overrides_url_from_env(self, caplog): } config = _config(env_vars=saved_env) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) # Live env has a different url live_env = _BASE_ENV | { @@ -292,7 +301,7 @@ def test_apply_env_overrides_url_added_from_env(self): """Url from live env is added even if the saved config didn't have one.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) live_env = _BASE_ENV | { "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), @@ -306,7 +315,7 @@ def test_apply_env_overrides_llm_deployment_mismatch(self): """Restoring secrets fails if a saved LLM deployment is missing from the live model_list.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) # Live config has a different model_list (only one deployment, different name) different_model_list = [ @@ -447,20 +456,6 @@ def test_missing_stt_tts_params(self): } ) - def test_nvidia_stt_skips_params_validation(self): - """NVIDIA STT skips api_key/model validation (uses url-based config).""" - config = _config( - 
env_vars=_EVA_MODEL_LIST_ENV - | { - "EVA_MODEL__LLM": "gpt-5.2", - "EVA_MODEL__STT": "nvidia", - "EVA_MODEL__TTS": "cartesia", - "EVA_MODEL__STT_PARAMS": json.dumps({"url": "ws://localhost:8000"}), - "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic"}), - } - ) - assert config.model.stt == "nvidia" - class TestDefaults: """Verify default values match expectations.""" @@ -547,14 +542,14 @@ class TestDeprecatedEnvVars: _S2S_ENV, "REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"api_key": "k"}, + {"api_key": "k", "model": "model"}, lambda c: c.model.s2s_params, ), ( _S2S_ENV, "EVA_MODEL__REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"api_key": "k"}, + {"api_key": "k", "model": "model"}, lambda c: c.model.s2s_params, ), ( @@ -816,7 +811,7 @@ def test_s2s_config_from_env(self): env_vars=_EVA_MODEL_LIST_ENV | { "EVA_MODEL__S2S": "gpt-realtime-mini", - "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": ""}), + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": "", "model": "gpt-realtime-mini"}), } ) assert isinstance(config.model, SpeechToSpeechConfig) @@ -826,17 +821,25 @@ def test_s2s_config_from_cli(self): """--s2s-model selects SpeechToSpeechConfig.""" config = _config( env_vars=_EVA_MODEL_LIST_ENV, - cli_args=["--model.s2s", "gemini_live", "--model.s2s-params", '{"api_key": "test-key"}'], + cli_args=[ + "--model.s2s", + "gemini_live", + "--model.s2s-params", + '{"api_key": "test-key", "model": "gemini_live"}', + ], ) assert isinstance(config.model, SpeechToSpeechConfig) assert config.model.s2s == "gemini_live" - assert config.model.s2s_params == {"api_key": "test-key"} + assert config.model.s2s_params == {"api_key": "test-key", "model": "gemini_live"} def test_s2s_config_with_params(self): """S2S params are passed through.""" config = _config( env_vars=_EVA_MODEL_LIST_ENV, - model={"s2s": "gpt-realtime-mini", "s2s_params": {"voice": "alloy", "api_key": "key_1"}}, + model={ + "s2s": "gpt-realtime-mini", + "s2s_params": {"voice": "alloy", 
"api_key": "key_1", "model": "gpt-realtime-mini"}, + }, ) assert isinstance(config.model, SpeechToSpeechConfig) - assert config.model.s2s_params == {"voice": "alloy", "api_key": "key_1"} + assert config.model.s2s_params == {"voice": "alloy", "api_key": "key_1", "model": "gpt-realtime-mini"}