diff --git a/.env.example b/.env.example index 7585eac7..d11adaaa 100644 --- a/.env.example +++ b/.env.example @@ -167,20 +167,16 @@ EVA_MODEL__LLM=gpt-5.2 # GOOGLE_API_KEY=your_google_api_key_here # ============================================== -# Optional: Realtime / Audio-LLM Configuration +# Optional: Speech-to-Speech / Audio-LLM Configuration # ============================================== -# Only needed if benchmarking speech-to-speech or realtime models. +# Only needed if benchmarking speech-to-speech models. -# EVA_MODEL__REALTIME_MODEL=gpt-realtime-mini -# EVA_MODEL__REALTIME_MODEL_PARAMS='{"voice":"marin"}' +# EVA_MODEL__S2S=openai +# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "api_key": ""}' # EVA_MODEL__AUDIO_LLM= # EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}' -# Azure Realtime credentials (if using Azure realtime models) -# AZURE_OPENAI_REALTIME_API_KEY= -# AZURE_OPENAI_REALTIME_ENDPOINT= - # ============================================== # Optional: Execution Settings # ============================================== diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 2d00e8ff..ad62d66d 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -3,7 +3,6 @@ name: Tests on: merge_group: pull_request: - branches: [main] jobs: test: diff --git a/README.md b/README.md index 0ba0d575..2e492373 100644 --- a/README.md +++ b/README.md @@ -26,16 +26,25 @@ Agents that score well on task completion tend to score worse on conversational

Quick Start

+### Cloning the Repository + +If you're only interested in running the latest stable version of EVA, you can clone with `--branch latest`, and optionally speed things up with `--depth 1 --no-tags --single-branch`. +```bash +git clone https://github.com/ServiceNow/eva.git --branch latest --depth 1 --no-tags --single-branch +``` + +Otherwise, for development, you can clone the default branch, `main`. +```bash +git clone https://github.com/ServiceNow/eva.git +``` + ### Installation We recommend using [uv](https://docs.astral.sh/uv/) for fast, reliable dependency management. If you don't have `uv` installed, see the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/). -> [!NOTE] -> This project requires **Python 3.11–3.13** (set via `requires-python` in `pyproject.toml`). `uv` will automatically select a compatible version. If you're using pip, make sure you're running a supported Python version. +This project requires **Python 3.11–3.13** (set via `requires-python` in `pyproject.toml`). `uv` will automatically select a compatible version. If you're using pip, make sure you're running a supported Python version. ```bash -# Clone the repository -git clone https://github.com/ServiceNow/eva.git cd eva # Install all dependencies (uv automatically creates a virtual environment) @@ -46,18 +55,16 @@ cp .env.example .env # Edit .env with your API keys (ELEVENLABS_API_KEY, OPENAI_API_KEY required) ``` -> [!TIP] -> After installation, you can run EVA using either: -> - `eva` — CLI entry point (e.g., `eva --help`) -> - `python main.py` — script at the repo root (e.g., `python main.py --help`) -> -> If using an IDE, point your Python interpreter to `.venv/bin/python` so commands run in the virtual environment automatically. Otherwise, prefix commands with `uv run` or activate the environment with `source .venv/bin/activate`. 
+After installation, you can run EVA using either: +- `eva` — CLI entry point (e.g., `eva --help`) +- `python main.py` — script at the repo root (e.g., `python main.py --help`) + +If using an IDE, point your Python interpreter to `.venv/bin/python` so commands run in the virtual environment automatically. Otherwise, prefix commands with `uv run` or activate the environment with `source .venv/bin/activate`.
Alternative: using pip -> [!NOTE] -> This project requires Python 3.11. If you need to manage multiple Python versions, consider using [pyenv](https://github.com/pyenv/pyenv). +This project requires Python 3.11–3.13. If you need to manage multiple Python versions, consider using [pyenv](https://github.com/pyenv/pyenv). ```bash # Create and activate a virtual environment diff --git a/src/eva/assistant/pipeline/audio_llm_processor.py b/src/eva/assistant/pipeline/audio_llm_processor.py index a9154d4e..bb5b24b3 100644 --- a/src/eva/assistant/pipeline/audio_llm_processor.py +++ b/src/eva/assistant/pipeline/audio_llm_processor.py @@ -19,7 +19,6 @@ import asyncio import base64 import io -import os import time import wave from collections.abc import Awaitable @@ -418,7 +417,7 @@ def __init__( super().__init__(**kwargs) self._audio_collector = audio_collector params = params or {} - self._api_key = params.get("api_key") or os.getenv("OPENAI_API_KEY") + self._api_key = params.get("api_key") self._model = model self._system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT self._sample_rate = sample_rate diff --git a/src/eva/assistant/pipeline/observers.py b/src/eva/assistant/pipeline/observers.py index df1a50d5..a3755d48 100644 --- a/src/eva/assistant/pipeline/observers.py +++ b/src/eva/assistant/pipeline/observers.py @@ -22,6 +22,7 @@ from pipecat.observers.turn_tracking_observer import TurnTrackingObserver from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService from pipecat.services.llm_service import LLMService +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from pipecat.services.stt_service import STTService from pipecat.services.tts_service import TTSService @@ -31,7 +32,7 @@ logger = get_logger(__name__) -_TRANSCRIPTION_SERVICES = (STTService, AzureRealtimeLLMService) +_TRANSCRIPTION_SERVICES = (STTService, AzureRealtimeLLMService, OpenAIRealtimeLLMService) class WallClock(SystemClock): diff --git 
a/src/eva/assistant/pipeline/realtime_llm.py b/src/eva/assistant/pipeline/realtime_llm.py index 7d30bac2..b502b4df 100644 --- a/src/eva/assistant/pipeline/realtime_llm.py +++ b/src/eva/assistant/pipeline/realtime_llm.py @@ -1,6 +1,6 @@ """Instrumented realtime LLM service for correct audit log ordering and timestamps. -Subclasses AzureRealtimeLLMService to intercept raw OpenAI Realtime API events +Subclasses OpenAIRealtimeLLMService to intercept raw OpenAI Realtime API events (speech_started, speech_stopped, transcription.completed, response.done) which have a guaranteed ordering and carry item_id for correlation. @@ -11,17 +11,24 @@ Writing user entries on #3 and assistant entries on #5 guarantees correct order. """ +import struct import time from dataclasses import dataclass from typing import Any, Optional -from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService +from pipecat.frames.frames import Frame, InputAudioRawFrame, VADUserStartedSpeakingFrame, VADUserStoppedSpeakingFrame +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from eva.assistant.agentic.audit_log import AuditLog from eva.utils.logging import get_logger logger = get_logger(__name__) +# Audio threshold for detecting speech vs silence +# RMS values below this are considered silence +SILENCE_RMS_THRESHOLD = 10 + @dataclass class _UserTurnRecord: @@ -39,8 +46,20 @@ def _wall_ms() -> str: return str(int(round(time.time() * 1000))) -class InstrumentedRealtimeLLMService(AzureRealtimeLLMService): - """AzureRealtimeLLMService subclass that writes audit log entries with correct ordering and wall-clock timestamps derived from Realtime API events. 
+def _calculate_rms(audio_bytes: bytes) -> float: + """Calculate RMS (root mean square) energy of 16-bit PCM audio.""" + if len(audio_bytes) < 2: + return 0.0 + num_samples = len(audio_bytes) // 2 + samples = struct.unpack(f"<{num_samples}h", audio_bytes[: num_samples * 2]) + if not samples: + return 0.0 + sum_squares = sum(s * s for s in samples) + return (sum_squares / len(samples)) ** 0.5 + + +class InstrumentedRealtimeLLMService(OpenAIRealtimeLLMService): + """OpenAIRealtimeLLMService subclass that writes audit log entries with correct ordering and wall-clock timestamps derived from Realtime API events. All overridden methods call ``super()`` first so that the parent's frame processing (audio playback, interruption handling, metrics, etc.) is fully @@ -61,12 +80,35 @@ def __init__(self, *, audit_log: AuditLog, **kwargs: Any) -> None: # Track whether we're mid-assistant-response (for interruption flushing) self._assistant_responding: bool = False + # Track audio frame timing for VAD delay calculation + self._last_audio_frame_time: Optional[float] = None + self._vad_delay_ms: Optional[int] = None + + async def process_frame(self, frame: Frame, direction: FrameDirection) -> None: + """Track audio frame timing before passing to parent. + + Only updates the timestamp when audio has actual speech content (not silence), + so VAD delay calculation reflects when user actually stopped speaking. + """ + if isinstance(frame, InputAudioRawFrame): + rms = _calculate_rms(frame.audio) + if rms > SILENCE_RMS_THRESHOLD: + self._last_audio_frame_time = time.time() + + await super().process_frame(frame, direction) + async def _handle_evt_speech_started(self, evt: Any) -> None: """Fires when user starts speaking (input_audio_buffer.speech_started). Captures wall-clock start time. Also flushes any in-progress interrupted assistant response before recording the new user turn. 
""" + # Reset VAD tracking for new turn + self._vad_delay_ms = None + + # Broadcast VAD user started speaking frame because realtime VAD does not broadcast it themselves + await self.broadcast_frame(VADUserStartedSpeakingFrame) + # Flush interrupted assistant response if one is in progress if self._assistant_responding and self._current_assistant_transcript_parts: partial_text = "".join(self._current_assistant_transcript_parts) + " [interrupted]" @@ -92,8 +134,21 @@ async def _handle_evt_speech_started(self, evt: Any) -> None: async def _handle_evt_speech_stopped(self, evt: Any) -> None: """Fires when user stops speaking (input_audio_buffer.speech_stopped). - Captures wall-clock end time for the user turn. + Captures wall-clock end time for the user turn and calculates VAD delay. """ + speech_stopped_time = time.time() + + # Calculate VAD delay: time between last audio frame and speech_stopped event + if self._last_audio_frame_time is not None: + self._vad_delay_ms = int((speech_stopped_time - self._last_audio_frame_time) * 1000) + else: + logger.warning("speech_stopped fired but no audio frames were tracked") + self._vad_delay_ms = None + + # Reset audio tracking for next turn + self._last_audio_frame_time = None + + await self.broadcast_frame(VADUserStoppedSpeakingFrame) await super()._handle_evt_speech_stopped(evt) item_id = getattr(evt, "item_id", None) or "" @@ -145,6 +200,7 @@ async def _handle_evt_audio_delta(self, evt: Any) -> None: """Fires for each audio chunk of the assistant response. Captures wall-clock of the *first* delta as assistant response start. + Also logs the full user-perceived response latency including VAD delay. 
""" await super()._handle_evt_audio_delta(evt) @@ -152,6 +208,24 @@ async def _handle_evt_audio_delta(self, evt: Any) -> None: self._assistant_response_start_wall_ms = _wall_ms() self._assistant_responding = True + # Log full user-perceived latency (includes VAD delay) + if self._vad_delay_ms is not None: + # Find the most recent user turn to get speech_stopped time + recent_record = None + for record in self._user_turns.values(): + if record.speech_stopped_wall_ms: + recent_record = record + + if recent_record and recent_record.speech_stopped_wall_ms: + speech_stopped_ms = int(recent_record.speech_stopped_wall_ms) + response_start_ms = int(self._assistant_response_start_wall_ms) + vad_to_response_ms = response_start_ms - speech_stopped_ms + full_latency_ms = vad_to_response_ms + self._vad_delay_ms + logger.debug( + f"Full response latency: {full_latency_ms}ms " + f"(VAD delay: {self._vad_delay_ms}ms + response: {vad_to_response_ms}ms)" + ) + async def _handle_evt_audio_transcript_delta(self, evt: Any) -> None: """Fires for incremental assistant transcript text. @@ -220,6 +294,16 @@ def _reset_assistant_state(self) -> None: self._assistant_response_start_wall_ms = None self._assistant_responding = False + @property + def last_vad_delay_ms(self) -> Optional[int]: + """Return the most recent VAD delay in milliseconds. + + This is the time between when audio frames stopped arriving and when + OpenAI's VAD detected end of speech. Can be used to adjust response + latency measurements to reflect user-perceived latency. 
+ """ + return self._vad_delay_ms + @staticmethod def _response_has_function_calls(evt: Any) -> bool: """Return True if the response.done event contains any function_call outputs.""" diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 1fcdf76d..c8ee3eff 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -4,7 +4,6 @@ """ import datetime -import os from typing import Any, AsyncGenerator, Optional from deepgram import LiveOptions @@ -20,7 +19,6 @@ AssemblyAIConnectionParams, AssemblyAISTTService, ) -from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService from pipecat.services.cartesia.stt import CartesiaLiveOptions, CartesiaSTTService from pipecat.services.cartesia.tts import CartesiaTTSService from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService @@ -37,12 +35,14 @@ SemanticTurnDetection, SessionProperties, ) +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from pipecat.services.openai.stt import OpenAISTTService from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService from pipecat.services.stt_service import STTService from pipecat.services.tts_service import TTSService from pipecat.transcriptions.language import Language from pipecat.utils.text.base_text_filter import BaseTextFilter +from websockets.asyncio.client import connect as websocket_connect from eva.assistant.pipeline.alm_vllm import ALMvLLMClient from eva.assistant.pipeline.nvidia_baseten import BasetenSTTService, BasetenTTSService @@ -381,6 +381,15 @@ def create_realtime_llm_service( """ model_lower = (model or "").lower() + # Get realtime server prompt + prompt_manager = PromptManager() + system_prompt = prompt_manager.get_prompt( + "realtime_agent.system_prompt", + agent_personality=agent.description, + agent_instructions=agent.instructions, + datetime=current_date_time, + ) + openai_tools = agent.build_tools_for_realtime() if agent else 
None # Convert OpenAI format tools to pipecat format @@ -400,66 +409,66 @@ def create_realtime_llm_service( ) pipecat_tools = ToolsSchema(standard_tools=function_schemas) - # Get realtime server prompt - prompt_manager = PromptManager() - system_prompt = prompt_manager.get_prompt( - "realtime_agent.system_prompt", - agent_personality=agent.description, - agent_instructions=agent.instructions, - datetime=current_date_time, - ) - - if model_lower.startswith("gpt-realtime"): - # - # base_url =The full Azure WebSocket endpoint URL including api-version and deployment. - # Example: "wss://my-project.openai.azure.com/openai/v1/realtime" - url = os.environ.get("AZURE_OPENAI_REALTIME_ENDPOINT", "") - url += f"?model={model_lower}" - - session_properties = SessionProperties( - instructions=system_prompt, - audio=AudioConfiguration( - input=AudioInput( - transcription=InputAudioTranscription(model="whisper-1"), - # Set openai TurnDetection parameters. Not setting this at all will turn it - # on by default - turn_detection=SemanticTurnDetection(), - # Or set to False to disable openai turn detection and use transport VAD - # turn_detection=False, - # noise_reduction=InputAudioNoiseReduction(type="near_field"), - ), - output=AudioOutput( - voice=params.get("voice", "marin"), + if model_lower.startswith("openai"): + session_properties = get_openai_session_properties(system_prompt, params, pipecat_tools) + if audit_log is not None: + logger.info(f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params['model']}") + return InstrumentedRealtimeLLMService( + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, ), + audit_log=audit_log, + api_key=params["api_key"], + ) + + return OpenAIRealtimeLLMService( + api_key=params["api_key"], + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, ), - tools=pipecat_tools, - tool_choice="auto", ) - 
logger.info(f"Using Azure Realtime LLM: {model_lower}") + elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): + # + # base_url: The full Azure WebSocket endpoint URL including api-version and deployment. + # Example: "wss://my-project.openai.azure.com/openai/v1/realtime" + url = params.get("url", "") + session_properties = get_openai_session_properties(system_prompt, params, pipecat_tools) + + logger.info(f"Using Azure Realtime LLM: {model_lower}, url {url}") if audit_log is not None: logger.info("Using InstrumentedRealtimeLLMService for audit log interception") - return InstrumentedRealtimeLLMService( - model=model_lower, + service = InstrumentedRealtimeLLMService( audit_log=audit_log, - api_key=os.environ.get("AZURE_OPENAI_REALTIME_API_KEY"), + api_key=params["api_key"], base_url=url, session_properties=session_properties, + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, + ), ) + InstrumentedRealtimeLLMService._connect = override__connect # azure realtime connect + return service - return AzureRealtimeLLMService( - api_key=os.environ.get("AZURE_OPENAI_REALTIME_API_KEY"), + return OpenAIRealtimeLLMService( + api_key=params["api_key"], + model=params["model"], base_url=url, session_properties=session_properties, ) elif model_lower == "ultravox": + logger.info("Using Ultravox LLM") return UltravoxRealtimeLLMService( params=OneShotInputParams( - api_key=os.getenv("ULTRAVOX_API_KEY"), + api_key=params["api_key"], system_prompt=system_prompt, temperature=0.3, max_duration=datetime.timedelta(minutes=6), voice=params.get("voice", "03e20d03-35e4-43c4-bb18-9b18a2cd3086"), + model=params["model"], ), one_shot_selected_tools=pipecat_tools, ) @@ -468,6 +477,27 @@ def create_realtime_llm_service( raise ValueError(f"Unknown realtime model: {model}. 
Available: gpt-realtime, ultravox") +def get_openai_session_properties(system_prompt: str, params: dict, pipecat_tools) -> SessionProperties: + """Create openai compatible session properties object.""" + return SessionProperties( + instructions=system_prompt, + audio=AudioConfiguration( + input=AudioInput( + transcription=InputAudioTranscription( + model=params.get("transcription_model", "gpt-4o-mini-transcribe") + ), + # Set openai TurnDetection parameters. Not setting this at all will turn it on by default + turn_detection=SemanticTurnDetection(), + ), + output=AudioOutput( + voice=params.get("voice", "marin"), + ), + ), + tools=pipecat_tools, + tool_choice="auto", + ) + + def create_audio_llm_client( model: str, params: dict[str, Any], @@ -573,6 +603,27 @@ async def override_run_tts(self, text: str, context_id: str) -> AsyncGenerator[F yield ErrorFrame(error=f"Unknown error occurred: {e}") +async def override__connect(self): + # Allow connections to azure / other providers using a base_url + try: + if self._websocket: + # Here we assume that if we have a websocket, we are connected. We + # handle disconnections in the send/recv code paths. 
+ return + + logger.info(f"Connecting to {self.base_url}") + self._websocket = await websocket_connect( + uri=self.base_url, + additional_headers={ + "api-key": self.api_key, + }, + ) + self._receive_task = self.create_task(self._receive_task_handler()) + except Exception as e: + await self.push_error(error_msg=f"initialization error: {e}", exception=e) + self._websocket = None + + # Unicode to ASCII replacements for TTS _TTS_CHAR_MAP = str.maketrans( { diff --git a/src/eva/assistant/server.py b/src/eva/assistant/server.py index 57a0fc2e..4282e894 100644 --- a/src/eva/assistant/server.py +++ b/src/eva/assistant/server.py @@ -326,7 +326,10 @@ async def _realtime_tool_handler(params) -> None: "smart_turn_stop_secs", 0.8 ) # Shorter silence so we don't have to wait 3s if smart turn marks audio as incomplete - if isinstance(self.pipeline_config, PipelineConfig) and self.pipeline_config.turn_strategy == "external": + if ( + isinstance(self.pipeline_config, (PipelineConfig, SpeechToSpeechConfig)) + and self.pipeline_config.turn_strategy == "external" + ): logger.info("Using external user turn strategies") user_turn_strategies = ExternalUserTurnStrategies() vad_analyzer = None @@ -444,9 +447,29 @@ async def on_user_transcription(text: str, timestamp: str, turn_id: int | None) self._latency_measurements = [] async def on_latency_measured(observer, latency_seconds: float): - """Event handler for UserBotLatencyObserver - stores latency measurements.""" - self._latency_measurements.append(latency_seconds) - logger.debug(f"Response latency captured: {latency_seconds:.3f}s") + """Event handler for UserBotLatencyObserver - stores latency measurements. + + For realtime LLM, adds VAD delay to get full user-perceived latency. + For pipecat VAD (non-realtime), uses the latency as-is. 
+ """ + adjusted_latency = latency_seconds + + # Add VAD delay for realtime LLM to get full user-perceived latency + if isinstance(realtime_llm, InstrumentedRealtimeLLMService): + vad_delay_ms = realtime_llm.last_vad_delay_ms + if vad_delay_ms is not None: + vad_delay_s = vad_delay_ms / 1000.0 + adjusted_latency = latency_seconds + vad_delay_s + logger.debug( + f"Response latency captured: {adjusted_latency:.3f}s " + f"(VAD delay: {vad_delay_s:.3f}s + pipecat: {latency_seconds:.3f}s)" + ) + else: + logger.debug(f"Response latency captured: {latency_seconds:.3f}s (no VAD delay available)") + else: + logger.debug(f"Response latency captured: {latency_seconds:.3f}s") + + self._latency_measurements.append(adjusted_latency) user_bot_observer = UserBotLatencyObserver() user_bot_observer.add_event_handler("on_latency_measured", on_latency_measured) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 474d29a8..e08783bd 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -12,6 +12,8 @@ ``RunConfig(_env_file=".env", _cli_parse_args=True)``. 
""" +import copy +import logging from datetime import UTC, datetime from pathlib import Path from typing import Annotated, Any, ClassVar, Literal @@ -34,9 +36,12 @@ from eva.models.provenance import RunProvenance +logger = logging.getLogger(__name__) -def current_date_and_time(): - return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" + +def _param_alias(params: dict[str, Any]) -> str: + """Return the display alias from a params dict.""" + return params.get("alias") or params["model"] class PipelineConfig(BaseModel): @@ -73,6 +78,15 @@ class PipelineConfig(BaseModel): ), ) + @property + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return { + "stt": _param_alias(self.stt_params), + "llm": self.llm, + "tts": _param_alias(self.tts_params), + } + @model_validator(mode="before") @classmethod def _migrate_legacy_fields(cls, data: Any) -> Any: @@ -97,6 +111,22 @@ class SpeechToSpeechConfig(BaseModel): s2s: str = Field(description="Speech-to-speech model name", examples=["gpt-realtime-mini", "gemini_live"]) s2s_params: dict[str, Any] = Field({}, description="Additional speech-to-speech model parameters (JSON)") + turn_strategy: Literal["smart", "external"] = Field( + "smart", + description=( + "User turn detection strategy. " + "'smart' uses LocalSmartTurnAnalyzerV3 + SileroVAD (default). " + "'external' uses ExternalUserTurnStrategies for services with built-in turn detection " + "(e.g., deepgram-flux, Speechmatics). " + "Set via EVA_MODEL__TURN_STRATEGY=external." + ), + ) + + @property + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return {"s2s": _param_alias(self.s2s_params) or self.s2s} + class AudioLLMConfig(BaseModel): """Configuration for an Audio-LLM pipeline (audio in, text out, separate TTS). 
@@ -118,6 +148,14 @@ class AudioLLMConfig(BaseModel): tts: str = Field(description="TTS model", examples=["cartesia", "elevenlabs"]) tts_params: dict[str, Any] = Field({}, description="Additional TTS model parameters (JSON)") + @property + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return { + "audio_llm": _param_alias(self.audio_llm_params) or self.audio_llm, + "tts": _param_alias(self.tts_params) or self.tts, + } + _PIPELINE_FIELDS = { "llm", @@ -129,7 +167,7 @@ class AudioLLMConfig(BaseModel): *PipelineConfig._LEGACY_RENAMES, *PipelineConfig._LEGACY_DROP, } -_S2S_FIELDS = {"s2s", "s2s_params"} +_S2S_FIELDS = {"s2s", "s2s_params", "turn_strategy"} _AUDIO_LLM_FIELDS = {"audio_llm", "audio_llm_params", "tts", "tts_params"} @@ -269,6 +307,18 @@ class RunConfig(BaseSettings): "EVA_METRICS_TO_RUN": "EVA_METRICS", } + # Maps *_params field names to their provider field for env override logic + _PARAMS_TO_PROVIDER: ClassVar[dict[str, str]] = { + "stt_params": "stt", + "tts_params": "tts", + "s2s_params": "s2s", + "audio_llm_params": "audio_llm", + } + # Keys always read from the live environment (not persisted across runs) + _ENV_OVERRIDE_KEYS: ClassVar[set[str]] = {"url", "urls"} + # Substrings that identify secret keys (redacted in logs and config.json) + _SECRET_KEY_PATTERNS: ClassVar[set[str]] = {"key", "credentials", "secret"} + class ModelDeployment(DeploymentTypedDict): """DeploymentTypedDict that preserves extra keys in litellm_params.""" @@ -283,7 +333,7 @@ class ModelDeployment(DeploymentTypedDict): # Run identifier run_id: str = Field( - default_factory=current_date_and_time, + "timestamp and model name(s)", # Overwritten by _set_default_run_id() description="Run identifier, auto-generated if not provided", ) @@ -441,22 +491,31 @@ def _warn_deprecated_aliases(cls, data: Any) -> Any: @model_validator(mode="after") def _check_companion_services(self) -> "RunConfig": """Ensure required companion services are set for 
each pipeline mode.""" + required_keys = ["api_key", "model"] if isinstance(self.model, PipelineConfig): - self._validate_service_params("STT", self.model.stt, self.model.stt_params) - self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + self._validate_service_params("STT", self.model.stt, required_keys, self.model.stt_params) + self._validate_service_params("TTS", self.model.tts, required_keys, self.model.tts_params) elif isinstance(self.model, AudioLLMConfig): - self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + self._validate_service_params("TTS", self.model.tts, required_keys, self.model.tts_params) + self._validate_service_params("audio_llm", self.model.audio_llm, required_keys, self.model.audio_llm_params) + elif isinstance(self.model, SpeechToSpeechConfig): + # api_key is required, some s2s services don't require model + self._validate_service_params("S2S", self.model.s2s, required_keys, self.model.s2s_params) return self - # Providers that manage their own model/key resolution (e.g. 
WebSocket-based) - _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} + @model_validator(mode="after") + def _set_default_run_id(self) -> "RunConfig": + if "run_id" not in self.model_fields_set: + suffix = "_".join(v for v in self.model.pipeline_parts.values() if v) + self.run_id = f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}_{suffix}" + return self @classmethod - def _validate_service_params(cls, service: str, provider: str, params: dict[str, Any]) -> None: + def _validate_service_params( + cls, service: str, provider: str, required_keys: list[str], params: dict[str, Any] + ) -> None: """Validate that STT/TTS params contain required keys.""" - if provider.lower() in cls._SKIP_PARAMS_VALIDATION: - return - missing = [key for key in ("api_key", "model") if key not in params] + missing = [key for key in required_keys if key not in params] if missing: missing_str = " and ".join(f'"{k}"' for k in missing) env_var = f"EVA_MODEL__{service}_PARAMS" @@ -485,20 +544,131 @@ def _expand_metrics_all(cls, v: list[str] | None) -> list[str] | None: return [m for m in get_global_registry().list_metrics() if m not in cls._VALIDATION_METRIC_NAMES] return v + @classmethod + def _is_secret_key(cls, key: str) -> bool: + """Return True if *key* matches any pattern in _SECRET_KEY_PATTERNS.""" + return any(pattern in key for pattern in cls._SECRET_KEY_PATTERNS) + + @classmethod + def _redact_dict(cls, params: dict) -> dict: + """Return a copy of *params* with secret values replaced by ``***``.""" + return {k: "***" if cls._is_secret_key(k) else v for k, v in params.items()} + @field_serializer("model_list") @classmethod def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: """Redact secret values in litellm_params when serializing.""" redacted = [] for deployment in deployments: + deployment = copy.deepcopy(deployment) if "litellm_params" in deployment: - params = deployment["litellm_params"] - for key in params: - if "key" in key or "credentials" in key: - 
params[key] = "***" + deployment["litellm_params"] = cls._redact_dict(deployment["litellm_params"]) redacted.append(deployment) return redacted + @field_serializer("model") + @classmethod + def _redact_model_params(cls, model: ModelConfigUnion) -> dict: + """Redact secret values in STT/TTS/S2S/AudioLLM params when serializing.""" + data = model.model_dump(mode="json") + for field_name, value in data.items(): + if field_name.endswith("_params") and isinstance(value, dict): + data[field_name] = cls._redact_dict(value) + return data + + def apply_env_overrides(self, live: "RunConfig") -> None: + """Apply environment-dependent values from *live* config onto this (saved) config. + + Restores redacted secrets (``***``) and overrides dynamic fields (``url``, + ``urls``) in ``model.*_params`` and ``model_list[].litellm_params``. + + Raises: + ValueError: If provider or alias differs for a service with redacted secrets. + """ + # ── model.*_params (STT / TTS / S2S / AudioLLM) ── + for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): + saved = getattr(self.model, params_field, None) + source = getattr(live.model, params_field, None) + if not isinstance(saved, dict) or not isinstance(source, dict): + continue + + has_redacted = any(v == "***" for v in saved.values()) + has_env_overrides = any(k in saved or k in source for k in self._ENV_OVERRIDE_KEYS) + if not has_redacted and not has_env_overrides: + continue + + if has_redacted: + saved_alias = saved.get("alias") + live_alias = source.get("alias") + if saved_alias and live_alias and saved_alias != live_alias: + raise ValueError( + f"Cannot restore secrets: saved {params_field}[alias]={saved_alias!r} " + f"but current environment has {params_field}[alias]={live_alias!r}" + ) + + saved_provider = getattr(self.model, provider_field, None) + live_provider = getattr(live.model, provider_field, None) + if saved_provider != live_provider: + logger.warning( + f"Provider mismatch for {params_field}: saved 
{saved_provider!r}, " + f"current environment has {live_provider!r}" + ) + + saved_model = saved.get("model") + live_model = source.get("model") + if saved_model and live_model and saved_model != live_model: + logger.warning( + f"Model mismatch for {params_field}: saved {saved_model!r}, " + f"current environment has {live_model!r}" + ) + + for key, value in saved.items(): + if value == "***" and key in source: + saved[key] = source[key] + + # Always use url/urls from the live environment + for key in self._ENV_OVERRIDE_KEYS: + if key in source: + saved_val = saved.get(key) + if saved_val and saved_val != source[key]: + logger.warning( + f"{params_field}[{key}] differs: saved {saved_val!r}, " + f"using {source[key]!r} from current environment" + ) + saved[key] = source[key] + + # ── model_list[].litellm_params (LLM deployments) ── + live_by_name = {d["model_name"]: d for d in live.model_list if "model_name" in d} + for deployment in self.model_list: + name = deployment.get("model_name") + if not name: + continue + saved_params = deployment.get("litellm_params", {}) + has_redacted = any(v == "***" for v in saved_params.values()) + if not has_redacted: + continue + if name not in live_by_name: + raise ValueError( + f"Cannot restore secrets: deployment {name!r} not found in " + f"current EVA_MODEL_LIST (available: {list(live_by_name)})" + ) + live_params = live_by_name[name].get("litellm_params", {}) + for key, value in saved_params.items(): + if value == "***" and key in live_params: + saved_params[key] = live_params[key] + + # ── Log resolved configuration ── + for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): + params = getattr(self.model, params_field, None) + provider = getattr(self.model, provider_field, None) + if isinstance(params, dict) and params: + logger.info(f"Resolved {provider_field} ({provider}): {self._redact_dict(params)}") + + for deployment in self.model_list: + name = deployment.get("model_name", "?") + params = 
deployment.get("litellm_params", {}) + logger.info(f"Resolved deployment {name}: {self._redact_dict(params)}") + @classmethod def from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index f92d98af..ac5a45f3 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -138,7 +138,10 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: } config_path = self.output_dir / "config.json" - config_path.write_text(self.config.model_dump_json(indent=2)) + config_data = self.config.model_dump(mode="json") + pipeline_parts = self.config.model.pipeline_parts + config_data["pipeline_parts"] = pipeline_parts + config_path.write_text(json.dumps(config_data, indent=2)) # Build output_id list for tracking (supports pass@k) num_trials = self.config.num_trials diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py index 92d32b01..49096448 100644 --- a/src/eva/run_benchmark.py +++ b/src/eva/run_benchmark.py @@ -42,6 +42,9 @@ async def run_benchmark(config: RunConfig) -> int: logger.error(str(e)) return 1 + # Apply env-dependent values (secrets, urls) from live env onto saved config + runner.config.apply_env_overrides(config) + # Apply CLI overrides runner.config.max_rerun_attempts = config.max_rerun_attempts runner.config.force_rerun_metrics = config.force_rerun_metrics diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py index 2216fddc..56971149 100644 --- a/src/eva/utils/prompt_manager.py +++ b/src/eva/utils/prompt_manager.py @@ -121,7 +121,7 @@ def get_prompt(self, path: str, **variables) -> str: return value.format(**formatted_vars) except KeyError as e: raise KeyError( - "Missing variable {e} for prompt '{path}'. Available variables: {sorted(formatted_vars.keys())}" + f"Missing variable {e} for prompt '{path}'. 
Available variables: {sorted(formatted_vars.keys())}" ) from e diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 8248c39f..50f22c73 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -2,7 +2,6 @@ import json import os -from datetime import datetime from pathlib import Path from unittest.mock import MagicMock, patch @@ -56,6 +55,10 @@ "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "test_key", "model": "nova-2"}), "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "test_key", "model": "sonic"}), } +_S2S_ENV = _EVA_MODEL_LIST_ENV | { + "EVA_MODEL__S2S": "gpt-realtime-mini", + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": "", "model": "test"}), +} def _config( @@ -74,6 +77,12 @@ def _config( return RunConfig(_env_file=env_file, _cli_parse_args=cli_args, **kwargs) +def _load_json_into_runconfig(json_str: str) -> RunConfig: + """Load RunConfig from JSON with isolated environment (no real env vars).""" + with patch.dict(os.environ, {}, clear=True): + return RunConfig.model_validate_json(json_str) + + class TestRunConfig: def test_create_minimal_config(self): """Test creating a minimal RunConfig.""" @@ -81,7 +90,8 @@ def test_create_minimal_config(self): assert config.dataset_path == Path("data/airline_dataset.jsonl") assert config.tool_mocks_path == Path("data/airline_scenarios") - assert datetime.strptime(config.run_id, "%Y-%m-%d_%H-%M-%S.%f") + # run_id = timestamp + model suffix (e.g. 
"2024-01-15_14-30-45.123456_nova-2_gpt-5.2_sonic") + assert config.run_id.endswith("nova-2_gpt-5.2_sonic") assert config.max_concurrent_conversations == 1 assert config.conversation_timeout_seconds == 360 @@ -160,13 +170,172 @@ def test_indentation_in_model_list(self, tmp_path: Path, vars_location: str, ind assert config.model_list == MODEL_LIST def test_secrets_redacted(self): - """Secrets are redacted in model_list.""" + """Secrets are redacted in model_list and STT/TTS params.""" config = _config(env_vars=_BASE_ENV) dumped = config.model_dump(mode="json") assert dumped["model_list"][0]["litellm_params"]["api_key"] == "***" assert dumped["model_list"][1]["litellm_params"]["vertex_credentials"] == "***" assert dumped["model_list"][2]["litellm_params"]["aws_access_key_id"] == "***" assert dumped["model_list"][2]["litellm_params"]["aws_secret_access_key"] == "***" + # STT/TTS params api_key must also be redacted + assert dumped["model"]["stt_params"]["api_key"] == "***" + assert dumped["model"]["tts_params"]["api_key"] == "***" + # Non-secret fields preserved + assert dumped["model"]["stt_params"]["model"] == "nova-2" + assert dumped["model"]["tts_params"]["model"] == "sonic" + + def test_secrets_redaction_does_not_mutate_live_config(self): + """Serializing must not corrupt the in-memory config objects.""" + config = _config(env_vars=_BASE_ENV) + config.model_dump(mode="json") + # model_list keys must still hold real values + assert config.model_list[0]["litellm_params"]["api_key"] == "must_be_redacted" + assert config.model_list[1]["litellm_params"]["vertex_credentials"] == "must_be_redacted" + # STT/TTS params must still hold real values + assert config.model.stt_params["api_key"] == "test_key" + assert config.model.tts_params["api_key"] == "test_key" + + def test_apply_env_overrides(self): + """Redacted secrets are restored from a live config for both model and model_list.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded 
= _load_json_into_runconfig(dumped_json) + + # Everything is redacted after round-trip + assert loaded.model.stt_params["api_key"] == "***" + assert loaded.model.tts_params["api_key"] == "***" + assert loaded.model_list[0]["litellm_params"]["api_key"] == "***" + assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "***" + assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "***" + + loaded.apply_env_overrides(config) + + # STT/TTS params restored + assert loaded.model.stt_params["api_key"] == "test_key" + assert loaded.model.tts_params["api_key"] == "test_key" + assert loaded.model.stt_params["model"] == "nova-2" + # model_list restored + assert loaded.model_list[0]["litellm_params"]["api_key"] == "must_be_redacted" + assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "must_be_redacted" + assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" + assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" + + def test_apply_env_overrides_provider_mismatch(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS provider changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = _load_json_into_runconfig(dumped_json) + + live = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT": "openai_whisper", + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "whisper-1"}), + } + ) + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.apply_env_overrides(live) + assert "Provider mismatch for stt_params" in caplog.text + assert "deepgram" in caplog.text + assert "openai_whisper" in caplog.text + + def test_apply_env_overrides_alias_mismatch(self): + """Restoring secrets fails if the alias changed.""" + config = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "alias": "stt-v1"}), + } + ) + dumped_json = 
config.model_dump_json() + loaded = _load_json_into_runconfig(dumped_json) + + live = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "alias": "stt-v2"}), + } + ) + with pytest.raises( + ValueError, + match=r"saved stt_params\[alias\]='stt-v1'.*current environment has stt_params\[alias\]='stt-v2'", + ): + loaded.apply_env_overrides(live) + + def test_apply_env_overrides_model_mismatch_warns(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = _load_json_into_runconfig(dumped_json) + + live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.apply_env_overrides(live) + assert "sonic" in caplog.text + assert "sonic-2" in caplog.text + assert loaded.model.tts_params["api_key"] == "k" + + def test_apply_env_overrides_url_from_env(self, caplog): + """Url is always taken from the live env, with a warning if it differs.""" + saved_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://old-host/stt"}), + } + config = _config(env_vars=saved_env) + dumped_json = config.model_dump_json() + loaded = _load_json_into_runconfig(dumped_json) + + # Live env has a different url + live_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), + } + live = _config(env_vars=live_env) + + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.apply_env_overrides(live) + + assert loaded.model.stt_params["url"] == "wss://new-host/stt" + assert "wss://old-host/stt" in caplog.text + assert "wss://new-host/stt" in caplog.text + + def test_apply_env_overrides_url_added_from_env(self): + """Url from live env is added even if the saved config 
didn't have one.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = _load_json_into_runconfig(dumped_json) + + live_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), + } + live = _config(env_vars=live_env) + loaded.apply_env_overrides(live) + + assert loaded.model.stt_params["url"] == "wss://new-host/stt" + + def test_apply_env_overrides_llm_deployment_mismatch(self): + """Restoring secrets fails if a saved LLM deployment is missing from the live model_list.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = _load_json_into_runconfig(dumped_json) + + # Live config has a different model_list (only one deployment, different name) + different_model_list = [ + { + "model_name": "gpt-4o", + "litellm_params": {"model": "openai/gpt-4o", "api_key": "real_key"}, + } + ] + live = _config( + env_vars={ + "EVA_MODEL_LIST": json.dumps(different_model_list), + "EVA_MODEL__LLM": "gpt-4o", + "EVA_MODEL__STT": "deepgram", + "EVA_MODEL__TTS": "cartesia", + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2"}), + "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic"}), + } + ) + with pytest.raises(ValueError, match=r"deployment 'gpt-5.2' not found in current EVA_MODEL_LIST"): + loaded.apply_env_overrides(live) @pytest.mark.parametrize( "environ, expected_exception, expected_message", @@ -287,20 +456,6 @@ def test_missing_stt_tts_params(self): } ) - def test_nvidia_stt_skips_params_validation(self): - """NVIDIA STT skips api_key/model validation (uses url-based config).""" - config = _config( - env_vars=_EVA_MODEL_LIST_ENV - | { - "EVA_MODEL__LLM": "gpt-5.2", - "EVA_MODEL__STT": "nvidia", - "EVA_MODEL__TTS": "cartesia", - "EVA_MODEL__STT_PARAMS": json.dumps({"url": "ws://localhost:8000"}), - "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic"}), - } - ) - assert 
config.model.stt == "nvidia" - class TestDefaults: """Verify default values match expectations.""" @@ -356,14 +511,14 @@ class TestDeprecatedEnvVars: lambda c: c.model.tts, ), ( - _EVA_MODEL_LIST_ENV, + _S2S_ENV, "REALTIME_MODEL", "EVA_MODEL__S2S", "test-model", lambda c: c.model.s2s, ), ( - _EVA_MODEL_LIST_ENV, + _S2S_ENV, "EVA_MODEL__REALTIME_MODEL", "EVA_MODEL__S2S", "test-model", @@ -384,17 +539,17 @@ class TestDeprecatedEnvVars: lambda c: c.model.tts_params, ), ( - _EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "test-model"}, + _S2S_ENV, "REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"foo": "bar"}, + {"api_key": "k", "model": "model"}, lambda c: c.model.s2s_params, ), ( - _EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "test-model"}, + _S2S_ENV, "EVA_MODEL__REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"foo": "bar"}, + {"api_key": "k", "model": "model"}, lambda c: c.model.s2s_params, ), ( @@ -581,7 +736,7 @@ def test_tts_model(self): assert c.model.tts == "cartesia" def test_realtime_model(self): - config = _config(env_vars=_EVA_MODEL_LIST_ENV, cli_args=["--realtime-model", "test-model"]) + config = _config(env_vars=_S2S_ENV, cli_args=["--realtime-model", "test-model"]) assert config.model.s2s == "test-model" def test_run_id(self): @@ -652,20 +807,39 @@ class TestSpeechToSpeechConfig: def test_s2s_config_from_env(self): """EVA_MODEL__S2S selects SpeechToSpeechConfig.""" - config = _config(env_vars=_EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "gpt-realtime-mini"}) + config = _config( + env_vars=_EVA_MODEL_LIST_ENV + | { + "EVA_MODEL__S2S": "gpt-realtime-mini", + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": "", "model": "gpt-realtime-mini"}), + } + ) assert isinstance(config.model, SpeechToSpeechConfig) assert config.model.s2s == "gpt-realtime-mini" def test_s2s_config_from_cli(self): """--s2s-model selects SpeechToSpeechConfig.""" - config = _config(env_vars=_EVA_MODEL_LIST_ENV, cli_args=["--model.s2s", "gemini_live"]) + config = _config( + 
env_vars=_EVA_MODEL_LIST_ENV, + cli_args=[ + "--model.s2s", + "gemini_live", + "--model.s2s-params", + '{"api_key": "test-key", "model": "gemini_live"}', + ], + ) assert isinstance(config.model, SpeechToSpeechConfig) assert config.model.s2s == "gemini_live" + assert config.model.s2s_params == {"api_key": "test-key", "model": "gemini_live"} def test_s2s_config_with_params(self): """S2S params are passed through.""" config = _config( - env_vars=_EVA_MODEL_LIST_ENV, model={"s2s": "gpt-realtime-mini", "s2s_params": {"voice": "alloy"}} + env_vars=_EVA_MODEL_LIST_ENV, + model={ + "s2s": "gpt-realtime-mini", + "s2s_params": {"voice": "alloy", "api_key": "key_1", "model": "gpt-realtime-mini"}, + }, ) assert isinstance(config.model, SpeechToSpeechConfig) - assert config.model.s2s_params == {"voice": "alloy"} + assert config.model.s2s_params == {"voice": "alloy", "api_key": "key_1", "model": "gpt-realtime-mini"}