From e5ac134d8411d3c51f20ea548ff49282a91d3442 Mon Sep 17 00:00:00 2001 From: Katrina Date: Wed, 25 Mar 2026 20:28:00 -0400 Subject: [PATCH 01/25] add support for openai realtime model. Add vad tracking for realtime models (using external vad) --- .env.example | 12 +-- src/eva/assistant/pipeline/observers.py | 3 +- src/eva/assistant/pipeline/realtime_llm.py | 94 ++++++++++++++++- src/eva/assistant/pipeline/services.py | 116 ++++++++++++++------- src/eva/assistant/server.py | 31 +++++- src/eva/models/config.py | 13 ++- src/eva/utils/prompt_manager.py | 2 +- 7 files changed, 212 insertions(+), 59 deletions(-) diff --git a/.env.example b/.env.example index 061dd906..7398c5d0 100644 --- a/.env.example +++ b/.env.example @@ -167,20 +167,16 @@ EVA_MODEL__LLM=gpt-5.2 # GOOGLE_API_KEY=your_google_api_key_here # ============================================== -# Optional: Realtime / Audio-LLM Configuration +# Optional: Speech-to-Speech / Audio-LLM Configuration # ============================================== -# Only needed if benchmarking speech-to-speech or realtime models. +# Only needed if benchmarking speech-to-speech models. 
-# EVA_MODEL__REALTIME_MODEL=gpt-realtime-mini -# EVA_MODEL__REALTIME_MODEL_PARAMS='{"voice":"marin"}' +# EVA_MODEL__S2S=openai +# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "voice": "marin"}' # EVA_MODEL__AUDIO_LLM= # EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}' -# Azure Realtime credentials (if using Azure realtime models) -# AZURE_OPENAI_REALTIME_API_KEY= -# AZURE_OPENAI_REALTIME_ENDPOINT= - # ============================================== # Optional: Execution Settings # ============================================== diff --git a/src/eva/assistant/pipeline/observers.py b/src/eva/assistant/pipeline/observers.py index df1a50d5..a3755d48 100644 --- a/src/eva/assistant/pipeline/observers.py +++ b/src/eva/assistant/pipeline/observers.py @@ -22,6 +22,7 @@ from pipecat.observers.turn_tracking_observer import TurnTrackingObserver from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService from pipecat.services.llm_service import LLMService +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from pipecat.services.stt_service import STTService from pipecat.services.tts_service import TTSService @@ -31,7 +32,7 @@ logger = get_logger(__name__) -_TRANSCRIPTION_SERVICES = (STTService, AzureRealtimeLLMService) +_TRANSCRIPTION_SERVICES = (STTService, AzureRealtimeLLMService, OpenAIRealtimeLLMService) class WallClock(SystemClock): diff --git a/src/eva/assistant/pipeline/realtime_llm.py b/src/eva/assistant/pipeline/realtime_llm.py index 7d30bac2..b502b4df 100644 --- a/src/eva/assistant/pipeline/realtime_llm.py +++ b/src/eva/assistant/pipeline/realtime_llm.py @@ -1,6 +1,6 @@ """Instrumented realtime LLM service for correct audit log ordering and timestamps. 
-Subclasses AzureRealtimeLLMService to intercept raw OpenAI Realtime API events +Subclasses OpenAIRealtimeLLMService to intercept raw OpenAI Realtime API events (speech_started, speech_stopped, transcription.completed, response.done) which have a guaranteed ordering and carry item_id for correlation. @@ -11,17 +11,24 @@ Writing user entries on #3 and assistant entries on #5 guarantees correct order. """ +import struct import time from dataclasses import dataclass from typing import Any, Optional -from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService +from pipecat.frames.frames import Frame, InputAudioRawFrame, VADUserStartedSpeakingFrame, VADUserStoppedSpeakingFrame +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from eva.assistant.agentic.audit_log import AuditLog from eva.utils.logging import get_logger logger = get_logger(__name__) +# Audio threshold for detecting speech vs silence +# RMS values below this are considered silence +SILENCE_RMS_THRESHOLD = 10 + @dataclass class _UserTurnRecord: @@ -39,8 +46,20 @@ def _wall_ms() -> str: return str(int(round(time.time() * 1000))) -class InstrumentedRealtimeLLMService(AzureRealtimeLLMService): - """AzureRealtimeLLMService subclass that writes audit log entries with correct ordering and wall-clock timestamps derived from Realtime API events. 
+def _calculate_rms(audio_bytes: bytes) -> float: + """Calculate RMS (root mean square) energy of 16-bit PCM audio.""" + if len(audio_bytes) < 2: + return 0.0 + num_samples = len(audio_bytes) // 2 + samples = struct.unpack(f"<{num_samples}h", audio_bytes[: num_samples * 2]) + if not samples: + return 0.0 + sum_squares = sum(s * s for s in samples) + return (sum_squares / len(samples)) ** 0.5 + + +class InstrumentedRealtimeLLMService(OpenAIRealtimeLLMService): + """OpenAIRealtimeLLMService subclass that writes audit log entries with correct ordering and wall-clock timestamps derived from Realtime API events. All overridden methods call ``super()`` first so that the parent's frame processing (audio playback, interruption handling, metrics, etc.) is fully @@ -61,12 +80,35 @@ def __init__(self, *, audit_log: AuditLog, **kwargs: Any) -> None: # Track whether we're mid-assistant-response (for interruption flushing) self._assistant_responding: bool = False + # Track audio frame timing for VAD delay calculation + self._last_audio_frame_time: Optional[float] = None + self._vad_delay_ms: Optional[int] = None + + async def process_frame(self, frame: Frame, direction: FrameDirection) -> None: + """Track audio frame timing before passing to parent. + + Only updates the timestamp when audio has actual speech content (not silence), + so VAD delay calculation reflects when user actually stopped speaking. + """ + if isinstance(frame, InputAudioRawFrame): + rms = _calculate_rms(frame.audio) + if rms > SILENCE_RMS_THRESHOLD: + self._last_audio_frame_time = time.time() + + await super().process_frame(frame, direction) + async def _handle_evt_speech_started(self, evt: Any) -> None: """Fires when user starts speaking (input_audio_buffer.speech_started). Captures wall-clock start time. Also flushes any in-progress interrupted assistant response before recording the new user turn. 
""" + # Reset VAD tracking for new turn + self._vad_delay_ms = None + + # Broadcast VAD user started speaking frame because realtime VAD does not broadcast it themselves + await self.broadcast_frame(VADUserStartedSpeakingFrame) + # Flush interrupted assistant response if one is in progress if self._assistant_responding and self._current_assistant_transcript_parts: partial_text = "".join(self._current_assistant_transcript_parts) + " [interrupted]" @@ -92,8 +134,21 @@ async def _handle_evt_speech_started(self, evt: Any) -> None: async def _handle_evt_speech_stopped(self, evt: Any) -> None: """Fires when user stops speaking (input_audio_buffer.speech_stopped). - Captures wall-clock end time for the user turn. + Captures wall-clock end time for the user turn and calculates VAD delay. """ + speech_stopped_time = time.time() + + # Calculate VAD delay: time between last audio frame and speech_stopped event + if self._last_audio_frame_time is not None: + self._vad_delay_ms = int((speech_stopped_time - self._last_audio_frame_time) * 1000) + else: + logger.warning("speech_stopped fired but no audio frames were tracked") + self._vad_delay_ms = None + + # Reset audio tracking for next turn + self._last_audio_frame_time = None + + await self.broadcast_frame(VADUserStoppedSpeakingFrame) await super()._handle_evt_speech_stopped(evt) item_id = getattr(evt, "item_id", None) or "" @@ -145,6 +200,7 @@ async def _handle_evt_audio_delta(self, evt: Any) -> None: """Fires for each audio chunk of the assistant response. Captures wall-clock of the *first* delta as assistant response start. + Also logs the full user-perceived response latency including VAD delay. 
""" await super()._handle_evt_audio_delta(evt) @@ -152,6 +208,24 @@ async def _handle_evt_audio_delta(self, evt: Any) -> None: self._assistant_response_start_wall_ms = _wall_ms() self._assistant_responding = True + # Log full user-perceived latency (includes VAD delay) + if self._vad_delay_ms is not None: + # Find the most recent user turn to get speech_stopped time + recent_record = None + for record in self._user_turns.values(): + if record.speech_stopped_wall_ms: + recent_record = record + + if recent_record and recent_record.speech_stopped_wall_ms: + speech_stopped_ms = int(recent_record.speech_stopped_wall_ms) + response_start_ms = int(self._assistant_response_start_wall_ms) + vad_to_response_ms = response_start_ms - speech_stopped_ms + full_latency_ms = vad_to_response_ms + self._vad_delay_ms + logger.debug( + f"Full response latency: {full_latency_ms}ms " + f"(VAD delay: {self._vad_delay_ms}ms + response: {vad_to_response_ms}ms)" + ) + async def _handle_evt_audio_transcript_delta(self, evt: Any) -> None: """Fires for incremental assistant transcript text. @@ -220,6 +294,16 @@ def _reset_assistant_state(self) -> None: self._assistant_response_start_wall_ms = None self._assistant_responding = False + @property + def last_vad_delay_ms(self) -> Optional[int]: + """Return the most recent VAD delay in milliseconds. + + This is the time between when audio frames stopped arriving and when + OpenAI's VAD detected end of speech. Can be used to adjust response + latency measurements to reflect user-perceived latency. 
+ """ + return self._vad_delay_ms + @staticmethod def _response_has_function_calls(evt: Any) -> bool: """Return True if the response.done event contains any function_call outputs.""" diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index da83b77b..e4ce760c 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -20,7 +20,6 @@ AssemblyAIConnectionParams, AssemblyAISTTService, ) -from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService from pipecat.services.cartesia.stt import CartesiaLiveOptions, CartesiaSTTService from pipecat.services.cartesia.tts import CartesiaTTSService from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService @@ -36,12 +35,14 @@ SemanticTurnDetection, SessionProperties, ) +from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService from pipecat.services.openai.stt import OpenAISTTService from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService from pipecat.services.stt_service import STTService from pipecat.services.tts_service import TTSService from pipecat.transcriptions.language import Language from pipecat.utils.text.base_text_filter import BaseTextFilter +from websockets.asyncio.client import connect as websocket_connect from eva.assistant.pipeline.alm_vllm import ALMvLLMClient from eva.assistant.pipeline.nvidia_baseten import BasetenSTTService, BasetenTTSService @@ -371,6 +372,15 @@ def create_realtime_llm_service( """ model_lower = (model or "").lower() + # Get realtime server prompt + prompt_manager = PromptManager() + system_prompt = prompt_manager.get_prompt( + "realtime_agent.system_prompt", + agent_personality=agent.description, + agent_instructions=agent.instructions, + datetime=current_date_time, + ) + openai_tools = agent.build_tools_for_realtime() if agent else None # Convert OpenAI format tools to pipecat format @@ -390,62 +400,70 @@ def create_realtime_llm_service( ) pipecat_tools = 
ToolsSchema(standard_tools=function_schemas) - # Get realtime server prompt - prompt_manager = PromptManager() - system_prompt = prompt_manager.get_prompt( - "realtime_agent.system_prompt", - agent_personality=agent.description, - agent_instructions=agent.instructions, - datetime=current_date_time, + session_properties = SessionProperties( + instructions=system_prompt, + audio=AudioConfiguration( + input=AudioInput( + transcription=InputAudioTranscription( + model=params.get("transcription_model", "gpt-4o-mini-transcribe") + ), + # Set openai TurnDetection parameters. Not setting this at all will turn it on by default + turn_detection=SemanticTurnDetection(), + ), + output=AudioOutput( + voice=params.get("voice", "marin"), + ), + ), + tools=pipecat_tools, + tool_choice="auto", ) - if model_lower.startswith("gpt-realtime"): + if model_lower.startswith("openai"): + if audit_log is not None: + logger.info( + f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params.get('model')}" + ) + return InstrumentedRealtimeLLMService( + model=params.get("model"), + audit_log=audit_log, + api_key=params.get("api_key") or os.getenv("OPENAI_API_KEY"), + session_properties=session_properties, + ) + + return OpenAIRealtimeLLMService( + api_key=params.get("api_key"), + session_properties=session_properties, + ) + elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): # - # base_url =The full Azure WebSocket endpoint URL including api-version and deployment. + # base_url: The full Azure WebSocket endpoint URL including api-version and deployment. # Example: "wss://my-project.openai.azure.com/openai/v1/realtime" - url = os.environ.get("AZURE_OPENAI_REALTIME_ENDPOINT", "") - url += f"?model={model_lower}" - - session_properties = SessionProperties( - instructions=system_prompt, - audio=AudioConfiguration( - input=AudioInput( - transcription=InputAudioTranscription(model="whisper-1"), - # Set openai TurnDetection parameters. 
Not setting this at all will turn it - # on by default - turn_detection=SemanticTurnDetection(), - # Or set to False to disable openai turn detection and use transport VAD - # turn_detection=False, - # noise_reduction=InputAudioNoiseReduction(type="near_field"), - ), - output=AudioOutput( - voice=params.get("voice", "marin"), - ), - ), - tools=pipecat_tools, - tool_choice="auto", - ) - logger.info(f"Using Azure Realtime LLM: {model_lower}") + url = params.get("url", "") + + logger.info(f"Using Azure Realtime LLM: {model_lower}, url {url}") if audit_log is not None: logger.info("Using InstrumentedRealtimeLLMService for audit log interception") - return InstrumentedRealtimeLLMService( - model=model_lower, + service = InstrumentedRealtimeLLMService( + model=params.get("model"), audit_log=audit_log, - api_key=os.environ.get("AZURE_OPENAI_REALTIME_API_KEY"), + api_key=params.get("api_key"), base_url=url, session_properties=session_properties, ) + service._connect = override__connect.__get__(service) # azure realtime connect (bound per-instance so the class is not patched globally) + return service - return AzureRealtimeLLMService( - api_key=os.environ.get("AZURE_OPENAI_REALTIME_API_KEY"), + return OpenAIRealtimeLLMService( + api_key=params.get("api_key"), base_url=url, session_properties=session_properties, ) elif model_lower == "ultravox": + logger.info("Using Ultravox LLM") return UltravoxRealtimeLLMService( params=OneShotInputParams( - api_key=os.getenv("ULTRAVOX_API_KEY"), + api_key=params.get("api_key"), system_prompt=system_prompt, temperature=0.3, max_duration=datetime.timedelta(minutes=6), @@ -563,6 +581,26 @@ async def override_run_tts(self, text: str, context_id: str) -> AsyncGenerator[F yield ErrorFrame(error=f"Unknown error occurred: {e}") +async def override__connect(self): + try: + if self._websocket: + # Here we assume that if we have a websocket, we are connected. We + # handle disconnections in the send/recv code paths. 
+ return + + logger.info(f"Connecting to {self.base_url}") + self._websocket = await websocket_connect( + uri=self.base_url, + additional_headers={ + "api-key": self.api_key, + }, + ) + self._receive_task = self.create_task(self._receive_task_handler()) + except Exception as e: + await self.push_error(error_msg=f"initialization error: {e}", exception=e) + self._websocket = None + + # Unicode to ASCII replacements for TTS _TTS_CHAR_MAP = str.maketrans( { diff --git a/src/eva/assistant/server.py b/src/eva/assistant/server.py index 57a0fc2e..4282e894 100644 --- a/src/eva/assistant/server.py +++ b/src/eva/assistant/server.py @@ -326,7 +326,10 @@ async def _realtime_tool_handler(params) -> None: "smart_turn_stop_secs", 0.8 ) # Shorter silence so we don't have to wait 3s if smart turn marks audio as incomplete - if isinstance(self.pipeline_config, PipelineConfig) and self.pipeline_config.turn_strategy == "external": + if ( + isinstance(self.pipeline_config, (PipelineConfig, SpeechToSpeechConfig)) + and self.pipeline_config.turn_strategy == "external" + ): logger.info("Using external user turn strategies") user_turn_strategies = ExternalUserTurnStrategies() vad_analyzer = None @@ -444,9 +447,29 @@ async def on_user_transcription(text: str, timestamp: str, turn_id: int | None) self._latency_measurements = [] async def on_latency_measured(observer, latency_seconds: float): - """Event handler for UserBotLatencyObserver - stores latency measurements.""" - self._latency_measurements.append(latency_seconds) - logger.debug(f"Response latency captured: {latency_seconds:.3f}s") + """Event handler for UserBotLatencyObserver - stores latency measurements. + + For realtime LLM, adds VAD delay to get full user-perceived latency. + For pipecat VAD (non-realtime), uses the latency as-is. 
+ """ + adjusted_latency = latency_seconds + + # Add VAD delay for realtime LLM to get full user-perceived latency + if isinstance(realtime_llm, InstrumentedRealtimeLLMService): + vad_delay_ms = realtime_llm.last_vad_delay_ms + if vad_delay_ms is not None: + vad_delay_s = vad_delay_ms / 1000.0 + adjusted_latency = latency_seconds + vad_delay_s + logger.debug( + f"Response latency captured: {adjusted_latency:.3f}s " + f"(VAD delay: {vad_delay_s:.3f}s + pipecat: {latency_seconds:.3f}s)" + ) + else: + logger.debug(f"Response latency captured: {latency_seconds:.3f}s (no VAD delay available)") + else: + logger.debug(f"Response latency captured: {latency_seconds:.3f}s") + + self._latency_measurements.append(adjusted_latency) user_bot_observer = UserBotLatencyObserver() user_bot_observer.add_event_handler("on_latency_measured", on_latency_measured) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index cd8fe819..99b706b6 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -97,6 +97,17 @@ class SpeechToSpeechConfig(BaseModel): s2s: str = Field(description="Speech-to-speech model name", examples=["gpt-realtime-mini", "gemini_live"]) s2s_params: dict[str, Any] = Field({}, description="Additional speech-to-speech model parameters (JSON)") + turn_strategy: Literal["smart", "external"] = Field( + "smart", + description=( + "User turn detection strategy. " + "'smart' uses LocalSmartTurnAnalyzerV3 + SileroVAD (default). " + "'external' uses ExternalUserTurnStrategies for services with built-in turn detection " + "(e.g., deepgram-flux, Speechmatics). " + "Set via EVA_MODEL__TURN_STRATEGY=external." + ), + ) + class AudioLLMConfig(BaseModel): """Configuration for an Audio-LLM pipeline (audio in, text out, separate TTS). 
@@ -129,7 +140,7 @@ class AudioLLMConfig(BaseModel): *PipelineConfig._LEGACY_RENAMES, *PipelineConfig._LEGACY_DROP, } -_S2S_FIELDS = {"s2s", "s2s_params"} +_S2S_FIELDS = {"s2s", "s2s_params", "turn_strategy"} _AUDIO_LLM_FIELDS = {"audio_llm", "audio_llm_params", "tts", "tts_params"} diff --git a/src/eva/utils/prompt_manager.py b/src/eva/utils/prompt_manager.py index 2216fddc..56971149 100644 --- a/src/eva/utils/prompt_manager.py +++ b/src/eva/utils/prompt_manager.py @@ -121,7 +121,7 @@ def get_prompt(self, path: str, **variables) -> str: return value.format(**formatted_vars) except KeyError as e: raise KeyError( - "Missing variable {e} for prompt '{path}'. Available variables: {sorted(formatted_vars.keys())}" + f"Missing variable {e} for prompt '{path}'. Available variables: {sorted(formatted_vars.keys())}" ) from e From 227e008903318fa2e365f0172ace3a7790c892c2 Mon Sep 17 00:00:00 2001 From: Katrina Date: Wed, 25 Mar 2026 20:31:34 -0400 Subject: [PATCH 02/25] add api_key to s2s params example --- .env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 7398c5d0..4a434630 100644 --- a/.env.example +++ b/.env.example @@ -172,7 +172,7 @@ EVA_MODEL__LLM=gpt-5.2 # Only needed if benchmarking speech-to-speech models. 
# EVA_MODEL__S2S=openai -# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "voice": "marin"}' +# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "api_key": ""}' # EVA_MODEL__AUDIO_LLM= # EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}' From 59ac784c262cefa0316f8293a124055a6b601cd2 Mon Sep 17 00:00:00 2001 From: Katrina Date: Wed, 25 Mar 2026 20:33:03 -0400 Subject: [PATCH 03/25] add comment --- src/eva/assistant/pipeline/services.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index e4ce760c..9cfb9f5c 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -582,6 +582,7 @@ async def override_run_tts(self, text: str, context_id: str) -> AsyncGenerator[F async def override__connect(self): + # Allow connections to azure / other providers using a base_url try: if self._websocket: # Here we assume that if we have a websocket, we are connected. We From e4f81c4d65abadc1c4fccacec37033938b292342 Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 26 Mar 2026 11:06:23 -0400 Subject: [PATCH 04/25] force riva client 2.25.0 --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ec59536f..75a7d6b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ dependencies = [ "jaconv>=0.3.0", "regex>=2023.0.0", "more-itertools>=10.0.0", - "nvidia-riva-client>=2.25.0,<3.0.0" + "nvidia-riva-client>=2.25.0,<2.25.1" ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index b7c8efec..a291ea54 100644 --- a/uv.lock +++ b/uv.lock @@ -827,7 +827,7 @@ requires-dist = [ { name = "more-itertools", specifier = ">=10.0.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.5" }, { name = "numpy", specifier = ">=1.24" }, - { name = "nvidia-riva-client", specifier = ">=2.25.0,<3.0.0" }, + { name = "nvidia-riva-client", specifier = 
">=2.25.0,<2.25.1" }, { name = "onnxruntime", specifier = ">=1.16.0" }, { name = "openai", specifier = ">=1.0.0" }, { name = "pandas", specifier = ">=2.0" }, From f3a1c15b7c5629db1c53f990cf44ea72a60916b9 Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 26 Mar 2026 11:26:28 -0400 Subject: [PATCH 05/25] move session_properties object to openai/azure only flows --- src/eva/assistant/pipeline/services.py | 41 +++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 9cfb9f5c..939e9236 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -400,25 +400,8 @@ def create_realtime_llm_service( ) pipecat_tools = ToolsSchema(standard_tools=function_schemas) - session_properties = SessionProperties( - instructions=system_prompt, - audio=AudioConfiguration( - input=AudioInput( - transcription=InputAudioTranscription( - model=params.get("transcription_model", "gpt-4o-mini-transcribe") - ), - # Set openai TurnDetection parameters. Not setting this at all will turn it on by default - turn_detection=SemanticTurnDetection(), - ), - output=AudioOutput( - voice=params.get("voice", "marin"), - ), - ), - tools=pipecat_tools, - tool_choice="auto", - ) - if model_lower.startswith("openai"): + session_properties = get_openai_session_properties(system_prompt, params, pipecat_tools) if audit_log is not None: logger.info( f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params.get('model')}" @@ -439,6 +422,7 @@ def create_realtime_llm_service( # base_url: The full Azure WebSocket endpoint URL including api-version and deployment. 
# Example: "wss://my-project.openai.azure.com/openai/v1/realtime" url = params.get("url", "") + session_properties = get_openai_session_properties(system_prompt, params, pipecat_tools) logger.info(f"Using Azure Realtime LLM: {model_lower}, url {url}") @@ -476,6 +460,27 @@ def create_realtime_llm_service( raise ValueError(f"Unknown realtime model: {model}. Available: gpt-realtime, ultravox") +def get_openai_session_properties(system_prompt: str, params: dict, pipecat_tools) -> SessionProperties: + """Create openai compatible session properties object.""" + return SessionProperties( + instructions=system_prompt, + audio=AudioConfiguration( + input=AudioInput( + transcription=InputAudioTranscription( + model=params.get("transcription_model", "gpt-4o-mini-transcribe") + ), + # Set openai TurnDetection parameters. Not setting this at all will turn it on by default + turn_detection=SemanticTurnDetection(), + ), + output=AudioOutput( + voice=params.get("voice", "marin"), + ), + ), + tools=pipecat_tools, + tool_choice="auto", + ) + + def create_audio_llm_client( model: str, params: dict[str, Any], From a13480d0d50d6473797f85855b5cabc70676dd6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 18:02:44 -0400 Subject: [PATCH 06/25] Add model names to timestamp --- src/eva/models/config.py | 27 +++++++++++++++++++++++++ tests/unit/models/test_config_models.py | 4 ++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index cd8fe819..a674465d 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -39,6 +39,26 @@ def current_date_and_time(): return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" +def _model_suffix(model: Any) -> str: + """Build a short suffix from the model config for use in folder names.""" + if isinstance(model, PipelineConfig): + parts = [ + model.stt_params.get("alias") or model.stt_params.get("model") or model.stt or "", + 
model.llm, + model.tts_params.get("alias") or model.tts_params.get("model") or model.tts or "", + ] + elif isinstance(model, SpeechToSpeechConfig): + parts = [model.s2s_params.get("alias") or model.s2s_params.get("model") or model.s2s] + elif isinstance(model, AudioLLMConfig): + parts = [ + model.audio_llm_params.get("alias") or model.audio_llm_params.get("model") or model.audio_llm, + model.tts_params.get("alias") or model.tts_params.get("model") or model.tts or "", + ] + else: + return "" + return "_".join(p for p in parts if p) + + class PipelineConfig(BaseModel): """Configuration for a STT + LLM + TTS pipeline.""" @@ -452,6 +472,13 @@ def _check_companion_services(self) -> "RunConfig": if not self.model.tts: raise ValueError("EVA_MODEL__TTS is required when using EVA_MODEL__AUDIO_LLM (SpeechLM-TTS pipeline).") self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + + # Append model names to auto-generated run_id + if "run_id" not in self.model_fields_set: + suffix = _model_suffix(self.model) + if suffix: + self.run_id = f"{self.run_id}_{suffix}" + return self # Providers that manage their own model/key resolution (e.g. WebSocket-based) diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 9b77854c..47ca4873 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -2,7 +2,6 @@ import json import os -from datetime import datetime from pathlib import Path from unittest.mock import MagicMock, patch @@ -81,7 +80,8 @@ def test_create_minimal_config(self): assert config.dataset_path == Path("data/airline_dataset.jsonl") assert config.tool_mocks_path == Path("data/airline_scenarios") - assert datetime.strptime(config.run_id, "%Y-%m-%d_%H-%M-%S.%f") + # run_id = timestamp + model suffix (e.g. 
"2024-01-15_14-30-45.123456_nova-2_gpt-5.2_sonic") + assert config.run_id.endswith("nova-2_gpt-5.2_sonic") assert config.max_concurrent_conversations == 1 assert config.conversation_timeout_seconds == 360 From 44e521f026b1dab47a92b4e42ffc0de65d048c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 18:04:53 -0400 Subject: [PATCH 07/25] Make sure we are not saving api keys in config.json --- src/eva/models/config.py | 12 ++++++++++++ tests/unit/models/test_config_models.py | 8 +++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index a674465d..d9e46867 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -532,6 +532,18 @@ def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: redacted.append(deployment) return redacted + @field_serializer("model") + @classmethod + def _redact_model_params(cls, model: ModelConfigUnion) -> dict: + """Redact secret values in STT/TTS/S2S/AudioLLM params when serializing.""" + data = model.model_dump(mode="json") + for field_name, value in data.items(): + if field_name.endswith("_params") and isinstance(value, dict): + for key in value: + if "key" in key or "credentials" in key: + value[key] = "***" + return data + @classmethod def from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 47ca4873..3f445544 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -160,13 +160,19 @@ def test_indentation_in_model_list(self, tmp_path: Path, vars_location: str, ind assert config.model_list == MODEL_LIST def test_secrets_redacted(self): - """Secrets are redacted in model_list.""" + """Secrets are redacted in model_list and STT/TTS params.""" config = _config(env_vars=_BASE_ENV) dumped = 
config.model_dump(mode="json") assert dumped["model_list"][0]["litellm_params"]["api_key"] == "***" assert dumped["model_list"][1]["litellm_params"]["vertex_credentials"] == "***" assert dumped["model_list"][2]["litellm_params"]["aws_access_key_id"] == "***" assert dumped["model_list"][2]["litellm_params"]["aws_secret_access_key"] == "***" + # STT/TTS params api_key must also be redacted + assert dumped["model"]["stt_params"]["api_key"] == "***" + assert dumped["model"]["tts_params"]["api_key"] == "***" + # Non-secret fields preserved + assert dumped["model"]["stt_params"]["model"] == "nova-2" + assert dumped["model"]["tts_params"]["model"] == "sonic" @pytest.mark.parametrize( "environ, expected_exception, expected_message", From 1466cec64f470ac6a465c74dec00340f164c62fb Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 26 Mar 2026 18:06:01 -0400 Subject: [PATCH 08/25] make api_key required for realtime models --- .../assistant/pipeline/audio_llm_processor.py | 3 +-- src/eva/assistant/pipeline/services.py | 11 +++++------ src/eva/models/config.py | 17 ++++++++++++----- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/eva/assistant/pipeline/audio_llm_processor.py b/src/eva/assistant/pipeline/audio_llm_processor.py index a9154d4e..bb5b24b3 100644 --- a/src/eva/assistant/pipeline/audio_llm_processor.py +++ b/src/eva/assistant/pipeline/audio_llm_processor.py @@ -19,7 +19,6 @@ import asyncio import base64 import io -import os import time import wave from collections.abc import Awaitable @@ -418,7 +417,7 @@ def __init__( super().__init__(**kwargs) self._audio_collector = audio_collector params = params or {} - self._api_key = params.get("api_key") or os.getenv("OPENAI_API_KEY") + self._api_key = params.get["api_key"] self._model = model self._system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT self._sample_rate = sample_rate diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 939e9236..c750d6b2 
100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -4,7 +4,6 @@ """ import datetime -import os from typing import Any, AsyncGenerator, Optional from deepgram import LiveOptions @@ -409,12 +408,12 @@ def create_realtime_llm_service( return InstrumentedRealtimeLLMService( model=params.get("model"), audit_log=audit_log, - api_key=params.get("api_key") or os.getenv("OPENAI_API_KEY"), + api_key=params["api_key"], session_properties=session_properties, ) return OpenAIRealtimeLLMService( - api_key=params.get("api_key"), + api_key=params["api_key"], session_properties=session_properties, ) elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): @@ -431,7 +430,7 @@ def create_realtime_llm_service( service = InstrumentedRealtimeLLMService( model=params.get("model"), audit_log=audit_log, - api_key=params.get("api_key"), + api_key=params["api_key"], base_url=url, session_properties=session_properties, ) @@ -439,7 +438,7 @@ def create_realtime_llm_service( return service return OpenAIRealtimeLLMService( - api_key=params.get("api_key"), + api_key=params["api_key"], base_url=url, session_properties=session_properties, ) @@ -447,7 +446,7 @@ def create_realtime_llm_service( logger.info("Using Ultravox LLM") return UltravoxRealtimeLLMService( params=OneShotInputParams( - api_key=params.get("api_key"), + api_key=params["api_key"], system_prompt=system_prompt, temperature=0.3, max_duration=datetime.timedelta(minutes=6), diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 99b706b6..6a7ec0e9 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -452,28 +452,35 @@ def _warn_deprecated_aliases(cls, data: Any) -> Any: @model_validator(mode="after") def _check_companion_services(self) -> "RunConfig": """Ensure required companion services are set for each pipeline mode.""" + required_keys = ["api_key", "model"] if isinstance(self.model, PipelineConfig): if not
self.model.stt: raise ValueError("EVA_MODEL__STT is required when using EVA_MODEL__LLM (ASR-LLM-TTS pipeline).") if not self.model.tts: raise ValueError("EVA_MODEL__TTS is required when using EVA_MODEL__LLM (ASR-LLM-TTS pipeline).") - self._validate_service_params("STT", self.model.stt, self.model.stt_params) - self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + self._validate_service_params("STT", self.model.stt, required_keys, self.model.stt_params) + self._validate_service_params("TTS", self.model.tts, required_keys, self.model.tts_params) elif isinstance(self.model, AudioLLMConfig): if not self.model.tts: raise ValueError("EVA_MODEL__TTS is required when using EVA_MODEL__AUDIO_LLM (SpeechLM-TTS pipeline).") - self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + self._validate_service_params("TTS", self.model.tts, required_keys, self.model.tts_params) + self._validate_service_params("audio_llm", self.model.audio_llm, required_keys, self.model.audio_llm_params) + elif isinstance(self.model, SpeechToSpeechConfig): + # api_key is required, some s2s services don't require model + self._validate_service_params("S2S", self.model.s2s, ["api_key"], self.model.s2s_params) return self # Providers that manage their own model/key resolution (e.g. 
WebSocket-based) _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} @classmethod - def _validate_service_params(cls, service: str, provider: str, params: dict[str, Any]) -> None: + def _validate_service_params( + cls, service: str, provider: str, required_keys: list[str], params: dict[str, Any] + ) -> None: """Validate that STT/TTS params contain required keys.""" if provider.lower() in cls._SKIP_PARAMS_VALIDATION: return - missing = [key for key in ("api_key", "model") if key not in params] + missing = [key for key in required_keys if key not in params] if missing: missing_str = " and ".join(f'"{k}"' for k in missing) env_var = f"EVA_MODEL__{service}_PARAMS" From 74a1c018ee55d6f0cc0ad2699ae1f60dfaf001be Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 26 Mar 2026 18:25:04 -0400 Subject: [PATCH 09/25] fix test now that api_key is mandatory for realtime models --- tests/unit/models/test_config_models.py | 37 +++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 9b77854c..81e4179f 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -56,6 +56,10 @@ "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "test_key", "model": "nova-2"}), "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "test_key", "model": "sonic"}), } +_S2S_ENV = _EVA_MODEL_LIST_ENV | { + "EVA_MODEL__S2S": "gpt-realtime-mini", + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": ""}), +} def _config( @@ -355,14 +359,14 @@ class TestDeprecatedEnvVars: lambda c: c.model.tts, ), ( - _EVA_MODEL_LIST_ENV, + _S2S_ENV, "REALTIME_MODEL", "EVA_MODEL__S2S", "test-model", lambda c: c.model.s2s, ), ( - _EVA_MODEL_LIST_ENV, + _S2S_ENV, "EVA_MODEL__REALTIME_MODEL", "EVA_MODEL__S2S", "test-model", @@ -383,17 +387,17 @@ class TestDeprecatedEnvVars: lambda c: c.model.tts_params, ), ( - _EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "test-model"}, + _S2S_ENV, 
"REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"foo": "bar"}, + {"api_key": "k"}, lambda c: c.model.s2s_params, ), ( - _EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "test-model"}, + _S2S_ENV, "EVA_MODEL__REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"foo": "bar"}, + {"api_key": "k"}, lambda c: c.model.s2s_params, ), ( @@ -580,7 +584,7 @@ def test_tts_model(self): assert c.model.tts == "cartesia" def test_realtime_model(self): - config = _config(env_vars=_EVA_MODEL_LIST_ENV, cli_args=["--realtime-model", "test-model"]) + config = _config(env_vars=_S2S_ENV, cli_args=["--realtime-model", "test-model"]) assert config.model.s2s == "test-model" def test_domain_cli(self): @@ -656,20 +660,31 @@ class TestSpeechToSpeechConfig: def test_s2s_config_from_env(self): """EVA_MODEL__S2S selects SpeechToSpeechConfig.""" - config = _config(env_vars=_EVA_MODEL_LIST_ENV | {"EVA_MODEL__S2S": "gpt-realtime-mini"}) + config = _config( + env_vars=_EVA_MODEL_LIST_ENV + | { + "EVA_MODEL__S2S": "gpt-realtime-mini", + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": ""}), + } + ) assert isinstance(config.model, SpeechToSpeechConfig) assert config.model.s2s == "gpt-realtime-mini" def test_s2s_config_from_cli(self): """--s2s-model selects SpeechToSpeechConfig.""" - config = _config(env_vars=_EVA_MODEL_LIST_ENV, cli_args=["--model.s2s", "gemini_live"]) + config = _config( + env_vars=_EVA_MODEL_LIST_ENV, + cli_args=["--model.s2s", "gemini_live", "--model.s2s-params", '{"api_key": "test-key"}'], + ) assert isinstance(config.model, SpeechToSpeechConfig) assert config.model.s2s == "gemini_live" + assert config.model.s2s_params == {"api_key": "test-key"} def test_s2s_config_with_params(self): """S2S params are passed through.""" config = _config( - env_vars=_EVA_MODEL_LIST_ENV, model={"s2s": "gpt-realtime-mini", "s2s_params": {"voice": "alloy"}} + env_vars=_EVA_MODEL_LIST_ENV, + model={"s2s": "gpt-realtime-mini", "s2s_params": {"voice": "alloy", "api_key": "key_1"}}, ) assert 
isinstance(config.model, SpeechToSpeechConfig) - assert config.model.s2s_params == {"voice": "alloy"} + assert config.model.s2s_params == {"voice": "alloy", "api_key": "key_1"} From aee752d5ffafe641dc4425a9c484bb9272d6b6c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 18:25:06 -0400 Subject: [PATCH 10/25] On rerun read the api keys from .env and not config.json --- src/eva/models/config.py | 56 +++++++++++++++++++ src/eva/run_benchmark.py | 3 ++ tests/unit/models/test_config_models.py | 71 +++++++++++++++++++++++++ 3 files changed, 130 insertions(+) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index d9e46867..6e6a4ce7 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -12,6 +12,7 @@ ``RunConfig(_env_file=".env", _cli_parse_args=True)``. """ +import logging from datetime import UTC, datetime from pathlib import Path from typing import Annotated, Any, ClassVar, Literal @@ -34,6 +35,8 @@ from eva.models.provenance import RunProvenance +logger = logging.getLogger(__name__) + def current_date_and_time(): return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" @@ -544,6 +547,59 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: value[key] = "***" return data + def restore_redacted_secrets(self, live: "RunConfig") -> None: + """Replace redacted ``***`` values in ``*_params`` dicts with real values from *live* config. + + Raises: + ValueError: If the saved and live configs use different providers or aliases + for any service that has redacted secrets. + """ + # Map each params field to its provider field (e.g. 
stt_params -> stt) + _PARAMS_TO_PROVIDER = { + "stt_params": "stt", + "tts_params": "tts", + "s2s_params": "s2s", + "audio_llm_params": "audio_llm", + } + for params_field, provider_field in _PARAMS_TO_PROVIDER.items(): + saved = getattr(self.model, params_field, None) + source = getattr(live.model, params_field, None) + if not isinstance(saved, dict) or not isinstance(source, dict): + continue + has_redacted = any(v == "***" for v in saved.values()) + if not has_redacted: + continue + + # Check provider matches (e.g. stt: "deepgram" vs "cartesia") + saved_provider = getattr(self.model, provider_field, None) + live_provider = getattr(live.model, provider_field, None) + if saved_provider != live_provider: + raise ValueError( + f"Cannot restore secrets: saved {provider_field}={saved_provider!r} " + f"but current environment has {provider_field}={live_provider!r}" + ) + + # Check alias matches (strict — aliases identify a specific configuration) + saved_alias = saved.get("alias") + live_alias = source.get("alias") + if saved_alias and live_alias and saved_alias != live_alias: + raise ValueError( + f"Cannot restore secrets: saved {params_field}[alias]={saved_alias!r} " + f"but current environment has {params_field}[alias]={live_alias!r}" + ) + + # Warn if model changed (non-fatal — models can be updated) + saved_model = saved.get("model") + live_model = source.get("model") + if saved_model and live_model and saved_model != live_model: + logger.warning( + f"Model mismatch for {params_field}: saved {saved_model!r}, current environment has {live_model!r}" + ) + + for key, value in saved.items(): + if value == "***" and key in source: + saved[key] = source[key] + @classmethod def from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py index 92d32b01..78a66843 100644 --- a/src/eva/run_benchmark.py +++ b/src/eva/run_benchmark.py @@ -42,6 +42,9 @@ async def run_benchmark(config: 
RunConfig) -> int: logger.error(str(e)) return 1 + # Restore secrets redacted in config.json with live env values + runner.config.restore_redacted_secrets(config) + # Apply CLI overrides runner.config.max_rerun_attempts = config.max_rerun_attempts runner.config.force_rerun_metrics = config.force_rerun_metrics diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 3f445544..69e5fbd2 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -174,6 +174,77 @@ def test_secrets_redacted(self): assert dumped["model"]["stt_params"]["model"] == "nova-2" assert dumped["model"]["tts_params"]["model"] == "sonic" + def test_restore_redacted_secrets(self): + """Redacted secrets are restored from a live config.""" + config = _config(env_vars=_BASE_ENV) + # Simulate round-trip through config.json (redacted on dump, loaded back) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + assert loaded.model.stt_params["api_key"] == "***" + assert loaded.model.tts_params["api_key"] == "***" + + # Restore from live config (which has real keys from env) + loaded.restore_redacted_secrets(config) + assert loaded.model.stt_params["api_key"] == "test_key" + assert loaded.model.tts_params["api_key"] == "test_key" + # Non-secret fields unchanged + assert loaded.model.stt_params["model"] == "nova-2" + + def test_restore_redacted_secrets_provider_mismatch(self): + """Restoring secrets fails if the STT/TTS provider changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Live config uses a different STT provider + live = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT": "openai_whisper", + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "whisper-1"}), + } + ) + with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has 
stt='openai_whisper'"): + loaded.restore_redacted_secrets(live) + + def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Same provider, different model + live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.restore_redacted_secrets(live) + assert "sonic" in caplog.text + assert "sonic-2" in caplog.text + # Secrets still restored despite the warning + assert loaded.model.tts_params["api_key"] == "k" + + def test_restore_redacted_secrets_alias_mismatch(self): + """Restoring secrets fails if the alias changed.""" + config = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "alias": "stt-v1"}), + } + ) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "alias": "stt-v2"}), + } + ) + with pytest.raises( + ValueError, + match=r"saved stt_params\[alias\]='stt-v1'.*current environment has stt_params\[alias\]='stt-v2'", + ): + loaded.restore_redacted_secrets(live) + @pytest.mark.parametrize( "environ, expected_exception, expected_message", ( From e8855ab108303460454c193d354ceb893024791a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 18:55:23 -0400 Subject: [PATCH 11/25] Make sure to not mutate the api keys in memory --- src/eva/models/config.py | 2 ++ tests/unit/models/test_config_models.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 6e6a4ce7..e0dac293 100644 
--- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -12,6 +12,7 @@ ``RunConfig(_env_file=".env", _cli_parse_args=True)``. """ +import copy import logging from datetime import UTC, datetime from pathlib import Path @@ -527,6 +528,7 @@ def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: """Redact secret values in litellm_params when serializing.""" redacted = [] for deployment in deployments: + deployment = copy.deepcopy(deployment) if "litellm_params" in deployment: params = deployment["litellm_params"] for key in params: diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 69e5fbd2..079ce7e6 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -174,6 +174,17 @@ def test_secrets_redacted(self): assert dumped["model"]["stt_params"]["model"] == "nova-2" assert dumped["model"]["tts_params"]["model"] == "sonic" + def test_secrets_redaction_does_not_mutate_live_config(self): + """Serializing must not corrupt the in-memory config objects.""" + config = _config(env_vars=_BASE_ENV) + config.model_dump(mode="json") + # model_list keys must still hold real values + assert config.model_list[0]["litellm_params"]["api_key"] == "must_be_redacted" + assert config.model_list[1]["litellm_params"]["vertex_credentials"] == "must_be_redacted" + # STT/TTS params must still hold real values + assert config.model.stt_params["api_key"] == "test_key" + assert config.model.tts_params["api_key"] == "test_key" + def test_restore_redacted_secrets(self): """Redacted secrets are restored from a live config.""" config = _config(env_vars=_BASE_ENV) From 0f3a04b2c55e301c2217e90938d05a77fc007fd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Thu, 26 Mar 2026 19:13:58 -0400 Subject: [PATCH 12/25] Use same strategy for litellm --- src/eva/models/config.py | 41 ++++++++++---- tests/unit/models/test_config_models.py | 72 
++++++++++++++++++------- 2 files changed, 83 insertions(+), 30 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index e0dac293..c7a67f8e 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -550,13 +550,15 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: return data def restore_redacted_secrets(self, live: "RunConfig") -> None: - """Replace redacted ``***`` values in ``*_params`` dicts with real values from *live* config. + """Replace ``***`` values in this config with real values from *live*. + + Covers both ``model.*_params`` (STT/TTS/S2S/AudioLLM secrets) and + ``model_list[].litellm_params`` (LLM deployment secrets). Raises: - ValueError: If the saved and live configs use different providers or aliases - for any service that has redacted secrets. + ValueError: If provider or alias differs for a service with redacted secrets. """ - # Map each params field to its provider field (e.g. stt_params -> stt) + # ── model.*_params (STT / TTS / S2S / AudioLLM) ── _PARAMS_TO_PROVIDER = { "stt_params": "stt", "tts_params": "tts", @@ -568,11 +570,9 @@ def restore_redacted_secrets(self, live: "RunConfig") -> None: source = getattr(live.model, params_field, None) if not isinstance(saved, dict) or not isinstance(source, dict): continue - has_redacted = any(v == "***" for v in saved.values()) - if not has_redacted: + if not any(v == "***" for v in saved.values()): continue - # Check provider matches (e.g. 
stt: "deepgram" vs "cartesia") saved_provider = getattr(self.model, provider_field, None) live_provider = getattr(live.model, provider_field, None) if saved_provider != live_provider: @@ -581,7 +581,6 @@ def restore_redacted_secrets(self, live: "RunConfig") -> None: f"but current environment has {provider_field}={live_provider!r}" ) - # Check alias matches (strict — aliases identify a specific configuration) saved_alias = saved.get("alias") live_alias = source.get("alias") if saved_alias and live_alias and saved_alias != live_alias: @@ -590,18 +589,40 @@ def restore_redacted_secrets(self, live: "RunConfig") -> None: f"but current environment has {params_field}[alias]={live_alias!r}" ) - # Warn if model changed (non-fatal — models can be updated) saved_model = saved.get("model") live_model = source.get("model") if saved_model and live_model and saved_model != live_model: logger.warning( - f"Model mismatch for {params_field}: saved {saved_model!r}, current environment has {live_model!r}" + "Model mismatch for %s: saved %r, current environment has %r", + params_field, + saved_model, + live_model, ) for key, value in saved.items(): if value == "***" and key in source: saved[key] = source[key] + # ── model_list[].litellm_params (LLM deployments) ── + live_by_name = {d["model_name"]: d for d in live.model_list if "model_name" in d} + for deployment in self.model_list: + name = deployment.get("model_name") + if not name: + continue + saved_params = deployment.get("litellm_params", {}) + has_redacted = any(v == "***" for v in saved_params.values()) + if not has_redacted: + continue + if name not in live_by_name: + raise ValueError( + f"Cannot restore secrets: deployment {name!r} not found in " + f"current EVA_MODEL_LIST (available: {list(live_by_name)})" + ) + live_params = live_by_name[name].get("litellm_params", {}) + for key, value in saved_params.items(): + if value == "***" and key in live_params: + saved_params[key] = live_params[key] + @classmethod def 
from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 079ce7e6..7b00573e 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -186,20 +186,29 @@ def test_secrets_redaction_does_not_mutate_live_config(self): assert config.model.tts_params["api_key"] == "test_key" def test_restore_redacted_secrets(self): - """Redacted secrets are restored from a live config.""" + """Redacted secrets are restored from a live config for both model and model_list.""" config = _config(env_vars=_BASE_ENV) - # Simulate round-trip through config.json (redacted on dump, loaded back) dumped_json = config.model_dump_json() loaded = RunConfig.model_validate_json(dumped_json) + + # Everything is redacted after round-trip assert loaded.model.stt_params["api_key"] == "***" assert loaded.model.tts_params["api_key"] == "***" + assert loaded.model_list[0]["litellm_params"]["api_key"] == "***" + assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "***" + assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "***" - # Restore from live config (which has real keys from env) loaded.restore_redacted_secrets(config) + + # STT/TTS params restored assert loaded.model.stt_params["api_key"] == "test_key" assert loaded.model.tts_params["api_key"] == "test_key" - # Non-secret fields unchanged assert loaded.model.stt_params["model"] == "nova-2" + # model_list restored + assert loaded.model_list[0]["litellm_params"]["api_key"] == "must_be_redacted" + assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "must_be_redacted" + assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" + assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" def test_restore_redacted_secrets_provider_mismatch(self): """Restoring 
secrets fails if the STT/TTS provider changed.""" @@ -207,7 +216,6 @@ def test_restore_redacted_secrets_provider_mismatch(self): dumped_json = config.model_dump_json() loaded = RunConfig.model_validate_json(dumped_json) - # Live config uses a different STT provider live = _config( env_vars=_BASE_ENV | { @@ -218,21 +226,6 @@ def test_restore_redacted_secrets_provider_mismatch(self): with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): loaded.restore_redacted_secrets(live) - def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): - """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" - config = _config(env_vars=_BASE_ENV) - dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) - - # Same provider, different model - live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) - with caplog.at_level("WARNING", logger="eva.models.config"): - loaded.restore_redacted_secrets(live) - assert "sonic" in caplog.text - assert "sonic-2" in caplog.text - # Secrets still restored despite the warning - assert loaded.model.tts_params["api_key"] == "k" - def test_restore_redacted_secrets_alias_mismatch(self): """Restoring secrets fails if the alias changed.""" config = _config( @@ -256,6 +249,45 @@ def test_restore_redacted_secrets_alias_mismatch(self): ): loaded.restore_redacted_secrets(live) + def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.restore_redacted_secrets(live) + assert "sonic" 
in caplog.text + assert "sonic-2" in caplog.text + assert loaded.model.tts_params["api_key"] == "k" + + def test_restore_redacted_secrets_llm_deployment_mismatch(self): + """Restoring secrets fails if a saved LLM deployment is missing from the live model_list.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Live config has a different model_list (only one deployment, different name) + different_model_list = [ + { + "model_name": "gpt-4o", + "litellm_params": {"model": "openai/gpt-4o", "api_key": "real_key"}, + } + ] + live = _config( + env_vars={ + "EVA_MODEL_LIST": json.dumps(different_model_list), + "EVA_MODEL__LLM": "gpt-4o", + "EVA_MODEL__STT": "deepgram", + "EVA_MODEL__TTS": "cartesia", + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2"}), + "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic"}), + } + ) + with pytest.raises(ValueError, match=r"deployment 'gpt-5.2' not found in current EVA_MODEL_LIST"): + loaded.restore_redacted_secrets(live) + @pytest.mark.parametrize( "environ, expected_exception, expected_message", ( From ebd0a2691485ccb2d6a41699f72f63ec2ba68f9e Mon Sep 17 00:00:00 2001 From: tara-servicenow Date: Thu, 26 Mar 2026 17:09:05 -0700 Subject: [PATCH 13/25] Fix param get --- src/eva/assistant/pipeline/services.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 328b18aa..1b735824 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -418,12 +418,12 @@ def create_realtime_llm_service( return InstrumentedRealtimeLLMService( model=params.get("model"), audit_log=audit_log, - api_key=params.get["api_key"], + api_key=params["api_key"], session_properties=session_properties, ) return OpenAIRealtimeLLMService( - api_key=params.get["api_key"], + 
api_key=params["api_key"], session_properties=session_properties, ) elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): @@ -440,7 +440,7 @@ def create_realtime_llm_service( service = InstrumentedRealtimeLLMService( model=params.get("model"), audit_log=audit_log, - api_key=params.get["api_key"], + api_key=params["api_key"], base_url=url, session_properties=session_properties, ) @@ -448,7 +448,7 @@ def create_realtime_llm_service( return service return OpenAIRealtimeLLMService( - api_key=params.get["api_key"], + api_key=params["api_key"], base_url=url, session_properties=session_properties, ) @@ -456,7 +456,7 @@ def create_realtime_llm_service( logger.info("Using Ultravox LLM") return UltravoxRealtimeLLMService( params=OneShotInputParams( - api_key=params.get["api_key"], + api_key=params["api_key"], system_prompt=system_prompt, temperature=0.3, max_duration=datetime.timedelta(minutes=6), From 57299567a7074a3d4c57045c3d7df5831e7107d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Mon, 30 Mar 2026 16:58:21 -0400 Subject: [PATCH 14/25] Refactor pipeline name --- src/eva/models/config.py | 49 +++++++++++++++++++--------------- src/eva/orchestrator/runner.py | 4 ++- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index c7a67f8e..7cac90c4 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -43,24 +43,9 @@ def current_date_and_time(): return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" -def _model_suffix(model: Any) -> str: - """Build a short suffix from the model config for use in folder names.""" - if isinstance(model, PipelineConfig): - parts = [ - model.stt_params.get("alias") or model.stt_params.get("model") or model.stt or "", - model.llm, - model.tts_params.get("alias") or model.tts_params.get("model") or model.tts or "", - ] - elif isinstance(model, SpeechToSpeechConfig): - parts = [model.s2s_params.get("alias") 
or model.s2s_params.get("model") or model.s2s] - elif isinstance(model, AudioLLMConfig): - parts = [ - model.audio_llm_params.get("alias") or model.audio_llm_params.get("model") or model.audio_llm, - model.tts_params.get("alias") or model.tts_params.get("model") or model.tts or "", - ] - else: - return "" - return "_".join(p for p in parts if p) +def _param_alias(params: dict[str, Any]) -> str: + """Return the display alias from a params dict.""" + return params.get("alias") or params.get("model") or "" class PipelineConfig(BaseModel): @@ -97,6 +82,16 @@ class PipelineConfig(BaseModel): ), ) + @property + def pipeline_name(self) -> str: + """Short name for use in folder names.""" + parts = [ + _param_alias(self.stt_params) or self.stt or "", + self.llm, + _param_alias(self.tts_params) or self.tts or "", + ] + return "_".join(p for p in parts if p) + @model_validator(mode="before") @classmethod def _migrate_legacy_fields(cls, data: Any) -> Any: @@ -121,6 +116,11 @@ class SpeechToSpeechConfig(BaseModel): s2s: str = Field(description="Speech-to-speech model name", examples=["gpt-realtime-mini", "gemini_live"]) s2s_params: dict[str, Any] = Field({}, description="Additional speech-to-speech model parameters (JSON)") + @property + def pipeline_name(self) -> str: + """Short name for use in folder names.""" + return _param_alias(self.s2s_params) or self.s2s + class AudioLLMConfig(BaseModel): """Configuration for an Audio-LLM pipeline (audio in, text out, separate TTS). 
@@ -142,6 +142,15 @@ class AudioLLMConfig(BaseModel): tts: str | None = Field(None, description="TTS model", examples=["cartesia", "elevenlabs"]) tts_params: dict[str, Any] = Field({}, description="Additional TTS model parameters (JSON)") + @property + def pipeline_name(self) -> str: + """Short name for use in folder names.""" + parts = [ + _param_alias(self.audio_llm_params) or self.audio_llm, + _param_alias(self.tts_params) or self.tts or "", + ] + return "_".join(p for p in parts if p) + _PIPELINE_FIELDS = { "llm", @@ -479,9 +488,7 @@ def _check_companion_services(self) -> "RunConfig": # Append model names to auto-generated run_id if "run_id" not in self.model_fields_set: - suffix = _model_suffix(self.model) - if suffix: - self.run_id = f"{self.run_id}_{suffix}" + self.run_id = f"{self.run_id}_{self.model.pipeline_name}" return self diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index f92d98af..6507dace 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -138,7 +138,9 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: } config_path = self.output_dir / "config.json" - config_path.write_text(self.config.model_dump_json(indent=2)) + config_data = self.config.model_dump(mode="json") + config_data["pipeline_name"] = self.config.model.pipeline_name + config_path.write_text(json.dumps(config_data, indent=2)) # Build output_id list for tracking (supports pass@k) num_trials = self.config.num_trials From a10e2cbe5ee311f1398eddd70736a948dcc81090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Mon, 30 Mar 2026 17:04:08 -0400 Subject: [PATCH 15/25] Saving parts rather than name --- src/eva/models/config.py | 37 +++++++++++++++++----------------- src/eva/orchestrator/runner.py | 3 ++- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 7cac90c4..2ed25490 100644 --- 
a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -83,14 +83,13 @@ class PipelineConfig(BaseModel): ) @property - def pipeline_name(self) -> str: - """Short name for use in folder names.""" - parts = [ - _param_alias(self.stt_params) or self.stt or "", - self.llm, - _param_alias(self.tts_params) or self.tts or "", - ] - return "_".join(p for p in parts if p) + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return { + "stt": _param_alias(self.stt_params) or self.stt or "", + "llm": self.llm, + "tts": _param_alias(self.tts_params) or self.tts or "", + } @model_validator(mode="before") @classmethod @@ -117,9 +116,9 @@ class SpeechToSpeechConfig(BaseModel): s2s_params: dict[str, Any] = Field({}, description="Additional speech-to-speech model parameters (JSON)") @property - def pipeline_name(self) -> str: - """Short name for use in folder names.""" - return _param_alias(self.s2s_params) or self.s2s + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return {"s2s": _param_alias(self.s2s_params) or self.s2s} class AudioLLMConfig(BaseModel): @@ -143,13 +142,12 @@ class AudioLLMConfig(BaseModel): tts_params: dict[str, Any] = Field({}, description="Additional TTS model parameters (JSON)") @property - def pipeline_name(self) -> str: - """Short name for use in folder names.""" - parts = [ - _param_alias(self.audio_llm_params) or self.audio_llm, - _param_alias(self.tts_params) or self.tts or "", - ] - return "_".join(p for p in parts if p) + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return { + "audio_llm": _param_alias(self.audio_llm_params) or self.audio_llm, + "tts": _param_alias(self.tts_params) or self.tts or "", + } _PIPELINE_FIELDS = { @@ -488,7 +486,8 @@ def _check_companion_services(self) -> "RunConfig": # Append model names to auto-generated run_id if "run_id" not in self.model_fields_set: - self.run_id = 
f"{self.run_id}_{self.model.pipeline_name}" + suffix = "_".join(v for v in self.model.pipeline_parts.values() if v) + self.run_id = f"{self.run_id}_{suffix}" return self diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index 6507dace..ac5a45f3 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -139,7 +139,8 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: config_path = self.output_dir / "config.json" config_data = self.config.model_dump(mode="json") - config_data["pipeline_name"] = self.config.model.pipeline_name + pipeline_parts = self.config.model.pipeline_parts + config_data["pipeline_parts"] = pipeline_parts config_path.write_text(json.dumps(config_data, indent=2)) # Build output_id list for tracking (supports pass@k) From 98c6bbf2e63a95d04a86d7856f13dce5cdcc1a1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Tue, 31 Mar 2026 13:50:28 -0400 Subject: [PATCH 16/25] Read url from .env file and rename --- src/eva/models/config.py | 120 +++++++++++++++--------- src/eva/run_benchmark.py | 4 +- tests/unit/models/test_config_models.py | 56 +++++++++-- 3 files changed, 125 insertions(+), 55 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 2ed25490..34ac1ae6 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -300,6 +300,21 @@ class RunConfig(BaseSettings): "EVA_METRICS_TO_RUN": "EVA_METRICS", } + # Providers that manage their own model/key resolution (e.g. 
WebSocket-based) + _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} + + # Maps *_params field names to their provider field for env override logic + _PARAMS_TO_PROVIDER: ClassVar[dict[str, str]] = { + "stt_params": "stt", + "tts_params": "tts", + "s2s_params": "s2s", + "audio_llm_params": "audio_llm", + } + # Keys always read from the live environment (not persisted across runs) + _ENV_OVERRIDE_KEYS: ClassVar[set[str]] = {"url", "urls"} + # Substrings that identify secret keys (redacted in logs and config.json) + _SECRET_KEY_PATTERNS: ClassVar[set[str]] = {"key", "credentials", "secret"} + class ModelDeployment(DeploymentTypedDict): """DeploymentTypedDict that preserves extra keys in litellm_params.""" @@ -491,9 +506,6 @@ def _check_companion_services(self) -> "RunConfig": return self - # Providers that manage their own model/key resolution (e.g. WebSocket-based) - _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} - @classmethod def _validate_service_params(cls, service: str, provider: str, params: dict[str, Any]) -> None: """Validate that STT/TTS params contain required keys.""" @@ -555,58 +567,65 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: value[key] = "***" return data - def restore_redacted_secrets(self, live: "RunConfig") -> None: - """Replace ``***`` values in this config with real values from *live*. + def apply_env_overrides(self, live: "RunConfig") -> None: + """Apply environment-dependent values from *live* config onto this (saved) config. - Covers both ``model.*_params`` (STT/TTS/S2S/AudioLLM secrets) and - ``model_list[].litellm_params`` (LLM deployment secrets). + Restores redacted secrets (``***``) and overrides dynamic fields (``url``, + ``urls``) in ``model.*_params`` and ``model_list[].litellm_params``. Raises: ValueError: If provider or alias differs for a service with redacted secrets. 
""" # ── model.*_params (STT / TTS / S2S / AudioLLM) ── - _PARAMS_TO_PROVIDER = { - "stt_params": "stt", - "tts_params": "tts", - "s2s_params": "s2s", - "audio_llm_params": "audio_llm", - } - for params_field, provider_field in _PARAMS_TO_PROVIDER.items(): + for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): saved = getattr(self.model, params_field, None) source = getattr(live.model, params_field, None) if not isinstance(saved, dict) or not isinstance(source, dict): continue - if not any(v == "***" for v in saved.values()): - continue - - saved_provider = getattr(self.model, provider_field, None) - live_provider = getattr(live.model, provider_field, None) - if saved_provider != live_provider: - raise ValueError( - f"Cannot restore secrets: saved {provider_field}={saved_provider!r} " - f"but current environment has {provider_field}={live_provider!r}" - ) - - saved_alias = saved.get("alias") - live_alias = source.get("alias") - if saved_alias and live_alias and saved_alias != live_alias: - raise ValueError( - f"Cannot restore secrets: saved {params_field}[alias]={saved_alias!r} " - f"but current environment has {params_field}[alias]={live_alias!r}" - ) - saved_model = saved.get("model") - live_model = source.get("model") - if saved_model and live_model and saved_model != live_model: - logger.warning( - "Model mismatch for %s: saved %r, current environment has %r", - params_field, - saved_model, - live_model, - ) + has_redacted = any(v == "***" for v in saved.values()) + has_env_overrides = any(k in saved or k in source for k in self._ENV_OVERRIDE_KEYS) + if not has_redacted and not has_env_overrides: + continue - for key, value in saved.items(): - if value == "***" and key in source: + if has_redacted: + saved_provider = getattr(self.model, provider_field, None) + live_provider = getattr(live.model, provider_field, None) + if saved_provider != live_provider: + raise ValueError( + f"Cannot restore secrets: saved {provider_field}={saved_provider!r} " 
+ f"but current environment has {provider_field}={live_provider!r}" + ) + + saved_alias = saved.get("alias") + live_alias = source.get("alias") + if saved_alias and live_alias and saved_alias != live_alias: + raise ValueError( + f"Cannot restore secrets: saved {params_field}[alias]={saved_alias!r} " + f"but current environment has {params_field}[alias]={live_alias!r}" + ) + + saved_model = saved.get("model") + live_model = source.get("model") + if saved_model and live_model and saved_model != live_model: + logger.warning( + f"Model mismatch for {params_field}: saved {saved_model!r}, " + f"current environment has {live_model!r}" + ) + + for key, value in saved.items(): + if value == "***" and key in source: + saved[key] = source[key] + + # Always use url/urls from the live environment + for key in self._ENV_OVERRIDE_KEYS: + if key in source: + saved_val = saved.get(key) + if saved_val and saved_val != source[key]: + logger.warning( + f"{params_field}[{key}] differs: saved {saved_val!r}, " + f"using {source[key]!r} from current environment" + ) saved[key] = source[key] # ── model_list[].litellm_params (LLM deployments) ── @@ -629,6 +648,21 @@ def restore_redacted_secrets(self, live: "RunConfig") -> None: if value == "***" and key in live_params: saved_params[key] = live_params[key] + # ── Log resolved configuration ── + def _safe_params(p: dict) -> dict: + return {k: "***" if any(s in k for s in self._SECRET_KEY_PATTERNS) else v for k, v in p.items()} + + for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): + params = getattr(self.model, params_field, None) + provider = getattr(self.model, provider_field, None) + if isinstance(params, dict) and params: + logger.info(f"Resolved {provider_field} ({provider}): {_safe_params(params)}") + + for deployment in self.model_list: + name = deployment.get("model_name", "?") + params = deployment.get("litellm_params", {}) + logger.info(f"Resolved deployment {name}: {_safe_params(params)}") + @classmethod def 
from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py index 78a66843..49096448 100644 --- a/src/eva/run_benchmark.py +++ b/src/eva/run_benchmark.py @@ -42,8 +42,8 @@ async def run_benchmark(config: RunConfig) -> int: logger.error(str(e)) return 1 - # Restore secrets redacted in config.json with live env values - runner.config.restore_redacted_secrets(config) + # Apply env-dependent values (secrets, urls) from live env onto saved config + runner.config.apply_env_overrides(config) # Apply CLI overrides runner.config.max_rerun_attempts = config.max_rerun_attempts diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 7b00573e..4a2c22e4 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -185,7 +185,7 @@ def test_secrets_redaction_does_not_mutate_live_config(self): assert config.model.stt_params["api_key"] == "test_key" assert config.model.tts_params["api_key"] == "test_key" - def test_restore_redacted_secrets(self): + def test_apply_env_overrides(self): """Redacted secrets are restored from a live config for both model and model_list.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() @@ -198,7 +198,7 @@ def test_restore_redacted_secrets(self): assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "***" assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "***" - loaded.restore_redacted_secrets(config) + loaded.apply_env_overrides(config) # STT/TTS params restored assert loaded.model.stt_params["api_key"] == "test_key" @@ -210,7 +210,7 @@ def test_restore_redacted_secrets(self): assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" - def 
test_restore_redacted_secrets_provider_mismatch(self): + def test_apply_env_overrides_provider_mismatch(self): """Restoring secrets fails if the STT/TTS provider changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() @@ -224,9 +224,9 @@ def test_restore_redacted_secrets_provider_mismatch(self): } ) with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): - loaded.restore_redacted_secrets(live) + loaded.apply_env_overrides(live) - def test_restore_redacted_secrets_alias_mismatch(self): + def test_apply_env_overrides_alias_mismatch(self): """Restoring secrets fails if the alias changed.""" config = _config( env_vars=_BASE_ENV @@ -247,9 +247,9 @@ def test_restore_redacted_secrets_alias_mismatch(self): ValueError, match=r"saved stt_params\[alias\]='stt-v1'.*current environment has stt_params\[alias\]='stt-v2'", ): - loaded.restore_redacted_secrets(live) + loaded.apply_env_overrides(live) - def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): + def test_apply_env_overrides_model_mismatch_warns(self, caplog): """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() @@ -257,12 +257,48 @@ def test_restore_redacted_secrets_model_mismatch_warns(self, caplog): live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) with caplog.at_level("WARNING", logger="eva.models.config"): - loaded.restore_redacted_secrets(live) + loaded.apply_env_overrides(live) assert "sonic" in caplog.text assert "sonic-2" in caplog.text assert loaded.model.tts_params["api_key"] == "k" - def test_restore_redacted_secrets_llm_deployment_mismatch(self): + def test_apply_env_overrides_url_from_env(self, caplog): + """Url is always taken from the live env, with a warning if it differs.""" + saved_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": 
json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://old-host/stt"}), + } + config = _config(env_vars=saved_env) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Live env has a different url + live_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), + } + live = _config(env_vars=live_env) + + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.apply_env_overrides(live) + + assert loaded.model.stt_params["url"] == "wss://new-host/stt" + assert "wss://old-host/stt" in caplog.text + assert "wss://new-host/stt" in caplog.text + + def test_apply_env_overrides_url_added_from_env(self): + """Url from live env is added even if the saved config didn't have one.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), + } + live = _config(env_vars=live_env) + loaded.apply_env_overrides(live) + + assert loaded.model.stt_params["url"] == "wss://new-host/stt" + + def test_apply_env_overrides_llm_deployment_mismatch(self): """Restoring secrets fails if a saved LLM deployment is missing from the live model_list.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() @@ -286,7 +322,7 @@ def test_restore_redacted_secrets_llm_deployment_mismatch(self): } ) with pytest.raises(ValueError, match=r"deployment 'gpt-5.2' not found in current EVA_MODEL_LIST"): - loaded.restore_redacted_secrets(live) + loaded.apply_env_overrides(live) @pytest.mark.parametrize( "environ, expected_exception, expected_message", From d366872fa694814a7b0f0f822448442753bbb982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Tue, 31 Mar 2026 14:44:56 -0400 Subject: [PATCH 17/25] Address feebdack 
--- src/eva/models/config.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 34ac1ae6..a5022341 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -540,6 +540,16 @@ def _expand_metrics_all(cls, v: list[str] | None) -> list[str] | None: return [m for m in get_global_registry().list_metrics() if m not in cls._VALIDATION_METRIC_NAMES] return v + @classmethod + def _is_secret_key(cls, key: str) -> bool: + """Return True if *key* matches any pattern in _SECRET_KEY_PATTERNS.""" + return any(pattern in key for pattern in cls._SECRET_KEY_PATTERNS) + + @classmethod + def _redact_dict(cls, params: dict) -> dict: + """Return a copy of *params* with secret values replaced by ``***``.""" + return {k: "***" if cls._is_secret_key(k) else v for k, v in params.items()} + @field_serializer("model_list") @classmethod def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: @@ -548,10 +558,7 @@ def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: for deployment in deployments: deployment = copy.deepcopy(deployment) if "litellm_params" in deployment: - params = deployment["litellm_params"] - for key in params: - if "key" in key or "credentials" in key: - params[key] = "***" + deployment["litellm_params"] = cls._redact_dict(deployment["litellm_params"]) redacted.append(deployment) return redacted @@ -562,9 +569,7 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: data = model.model_dump(mode="json") for field_name, value in data.items(): if field_name.endswith("_params") and isinstance(value, dict): - for key in value: - if "key" in key or "credentials" in key: - value[key] = "***" + data[field_name] = cls._redact_dict(value) return data def apply_env_overrides(self, live: "RunConfig") -> None: @@ -649,19 +654,16 @@ def apply_env_overrides(self, live: "RunConfig") -> None: saved_params[key] = 
live_params[key] # ── Log resolved configuration ── - def _safe_params(p: dict) -> dict: - return {k: "***" if any(s in k for s in self._SECRET_KEY_PATTERNS) else v for k, v in p.items()} - for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): params = getattr(self.model, params_field, None) provider = getattr(self.model, provider_field, None) if isinstance(params, dict) and params: - logger.info(f"Resolved {provider_field} ({provider}): {_safe_params(params)}") + logger.info(f"Resolved {provider_field} ({provider}): {self._redact_dict(params)}") for deployment in self.model_list: name = deployment.get("model_name", "?") params = deployment.get("litellm_params", {}) - logger.info(f"Resolved deployment {name}: {_safe_params(params)}") + logger.info(f"Resolved deployment {name}: {self._redact_dict(params)}") @classmethod def from_yaml(cls, path: Path | str) -> "RunConfig": From 86adbc36692bf90f1b396dc31f9bff38253d0694 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Tue, 31 Mar 2026 16:08:11 -0400 Subject: [PATCH 18/25] Explain `run_id` default value in `eva --help` --- src/eva/models/config.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index a5022341..6022829b 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -39,10 +39,6 @@ logger = logging.getLogger(__name__) -def current_date_and_time(): - return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" - - def _param_alias(params: dict[str, Any]) -> str: """Return the display alias from a params dict.""" return params.get("alias") or params.get("model") or "" @@ -329,7 +325,7 @@ class ModelDeployment(DeploymentTypedDict): # Run identifier run_id: str = Field( - default_factory=current_date_and_time, + "timestamp and model name(s)", # Overwritten by _set_default_run_id() description="Run identifier, auto-generated if not provided", ) @@ -498,12 +494,13 @@ def _check_companion_services(self) -> 
"RunConfig": if not self.model.tts: raise ValueError("EVA_MODEL__TTS is required when using EVA_MODEL__AUDIO_LLM (SpeechLM-TTS pipeline).") self._validate_service_params("TTS", self.model.tts, self.model.tts_params) + return self - # Append model names to auto-generated run_id + @model_validator(mode="after") + def _set_default_run_id(self) -> "RunConfig": if "run_id" not in self.model_fields_set: suffix = "_".join(v for v in self.model.pipeline_parts.values() if v) - self.run_id = f"{self.run_id}_{suffix}" - + self.run_id = f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}_{suffix}" return self @classmethod From 96674b9253319f7ba128ebe5b178e0b67d317111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Wed, 1 Apr 2026 17:23:52 -0400 Subject: [PATCH 19/25] Address feebdack --- src/eva/models/config.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 6022829b..c37675c9 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -41,7 +41,7 @@ def _param_alias(params: dict[str, Any]) -> str: """Return the display alias from a params dict.""" - return params.get("alias") or params.get("model") or "" + return params.get("alias") or params["model"] class PipelineConfig(BaseModel): @@ -82,9 +82,9 @@ class PipelineConfig(BaseModel): def pipeline_parts(self) -> dict[str, str]: """Component names for this pipeline.""" return { - "stt": _param_alias(self.stt_params) or self.stt or "", + "stt": _param_alias(self.stt_params) or self.stt, "llm": self.llm, - "tts": _param_alias(self.tts_params) or self.tts or "", + "tts": _param_alias(self.tts_params) or self.tts, } @model_validator(mode="before") @@ -142,7 +142,7 @@ def pipeline_parts(self) -> dict[str, str]: """Component names for this pipeline.""" return { "audio_llm": _param_alias(self.audio_llm_params) or self.audio_llm, - "tts": _param_alias(self.tts_params) or self.tts or "", + 
"tts": _param_alias(self.tts_params) or self.tts, } @@ -591,14 +591,6 @@ def apply_env_overrides(self, live: "RunConfig") -> None: continue if has_redacted: - saved_provider = getattr(self.model, provider_field, None) - live_provider = getattr(live.model, provider_field, None) - if saved_provider != live_provider: - raise ValueError( - f"Cannot restore secrets: saved {provider_field}={saved_provider!r} " - f"but current environment has {provider_field}={live_provider!r}" - ) - saved_alias = saved.get("alias") live_alias = source.get("alias") if saved_alias and live_alias and saved_alias != live_alias: @@ -607,6 +599,14 @@ def apply_env_overrides(self, live: "RunConfig") -> None: f"but current environment has {params_field}[alias]={live_alias!r}" ) + saved_provider = getattr(self.model, provider_field, None) + live_provider = getattr(live.model, provider_field, None) + if saved_provider != live_provider: + logger.warning( + f"Provider mismatch for {params_field}: saved {saved_provider!r}, " + f"current environment has {live_provider!r}" + ) + saved_model = saved.get("model") live_model = source.get("model") if saved_model and live_model and saved_model != live_model: From 5facc71b41bfcbec51f7e27d7b124cc9d35da843 Mon Sep 17 00:00:00 2001 From: JosephMarinier <8386369+JosephMarinier@users.noreply.github.com> Date: Tue, 7 Apr 2026 20:12:20 +0000 Subject: [PATCH 20/25] Apply pre-commit --- src/eva/models/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 6f98a029..b69a2beb 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -121,7 +121,7 @@ class SpeechToSpeechConfig(BaseModel): "Set via EVA_MODEL__TURN_STRATEGY=external." 
), ) - + @property def pipeline_parts(self) -> dict[str, str]: """Component names for this pipeline.""" From a4caf5d5365ae8132312412cf5b46d790ae200fb Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Tue, 7 Apr 2026 19:15:52 -0400 Subject: [PATCH 21/25] Adapt test_apply_env_overrides_provider_mismatch --- tests/unit/models/test_config_models.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 99424e22..110d9ff4 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -214,8 +214,8 @@ def test_apply_env_overrides(self): assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" - def test_apply_env_overrides_provider_mismatch(self): - """Restoring secrets fails if the STT/TTS provider changed.""" + def test_apply_env_overrides_provider_mismatch(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS provider changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() loaded = RunConfig.model_validate_json(dumped_json) @@ -227,8 +227,9 @@ def test_apply_env_overrides_provider_mismatch(self): "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "whisper-1"}), } ) - with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): + with caplog.at_level("WARNING", logger="eva.models.config"): loaded.apply_env_overrides(live) + assert "saved 'deepgram', current environment has 'openai_whisper'" in caplog.text def test_apply_env_overrides_alias_mismatch(self): """Restoring secrets fails if the alias changed.""" From c30bf0796de9fb4d35b8224d6a97b6ec89f5422f Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Tue, 7 Apr 2026 19:16:21 -0400 Subject: [PATCH 22/25] Run tests in all PRs, no matter 
the branch --- .github/workflows/tests.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 2d00e8ff..ad62d66d 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -3,7 +3,6 @@ name: Tests on: merge_group: pull_request: - branches: [main] jobs: test: From cc864d474926d52f87bfcac01fa701f829c097d5 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Wed, 8 Apr 2026 11:32:50 -0400 Subject: [PATCH 23/25] Remove Markdown alerts inside details as they don't work in GitHub. --- README.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 0ba0d575..848c3784 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,7 @@ Agents that score well on task completion tend to score worse on conversational We recommend using [uv](https://docs.astral.sh/uv/) for fast, reliable dependency management. If you don't have `uv` installed, see the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/). -> [!NOTE] -> This project requires **Python 3.11–3.13** (set via `requires-python` in `pyproject.toml`). `uv` will automatically select a compatible version. If you're using pip, make sure you're running a supported Python version. +This project requires **Python 3.11–3.13** (set via `requires-python` in `pyproject.toml`). `uv` will automatically select a compatible version. If you're using pip, make sure you're running a supported Python version. 
```bash # Clone the repository @@ -46,18 +45,16 @@ cp .env.example .env # Edit .env with your API keys (ELEVENLABS_API_KEY, OPENAI_API_KEY required) ``` -> [!TIP] -> After installation, you can run EVA using either: -> - `eva` — CLI entry point (e.g., `eva --help`) -> - `python main.py` — script at the repo root (e.g., `python main.py --help`) -> -> If using an IDE, point your Python interpreter to `.venv/bin/python` so commands run in the virtual environment automatically. Otherwise, prefix commands with `uv run` or activate the environment with `source .venv/bin/activate`. +After installation, you can run EVA using either: +- `eva` — CLI entry point (e.g., `eva --help`) +- `python main.py` — script at the repo root (e.g., `python main.py --help`) + +If using an IDE, point your Python interpreter to `.venv/bin/python` so commands run in the virtual environment automatically. Otherwise, prefix commands with `uv run` or activate the environment with `source .venv/bin/activate`.
Alternative: using pip -> [!NOTE] -> This project requires Python 3.11. If you need to manage multiple Python versions, consider using [pyenv](https://github.com/pyenv/pyenv). +This project requires Python 3.11. If you need to manage multiple Python versions, consider using [pyenv](https://github.com/pyenv/pyenv). ```bash # Create and activate a virtual environment From 3e2cc30cdf9889e51ab7f43202066821c27fb2a3 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Wed, 8 Apr 2026 11:33:54 -0400 Subject: [PATCH 24/25] Document cloning latest tag --- README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 848c3784..2e492373 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,18 @@ Agents that score well on task completion tend to score worse on conversational

Quick Start

+### Cloning the Repository + +If you're only interested in running the latest stable version of EVA, you can clone with `--branch latest`, and optionally speed things up with `--depth 1 --no-tags --single-branch`. +```bash +git clone https://github.com/ServiceNow/eva.git --branch latest --depth 1 --no-tags --single-branch +``` + +Otherwise, for development, you can clone the default branch, `main`. +```bash +git clone https://github.com/ServiceNow/eva.git +``` + ### Installation We recommend using [uv](https://docs.astral.sh/uv/) for fast, reliable dependency management. If you don't have `uv` installed, see the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/). @@ -33,8 +45,6 @@ We recommend using [uv](https://docs.astral.sh/uv/) for fast, reliable dependenc This project requires **Python 3.11–3.13** (set via `requires-python` in `pyproject.toml`). `uv` will automatically select a compatible version. If you're using pip, make sure you're running a supported Python version. 
```bash -# Clone the repository -git clone https://github.com/ServiceNow/eva.git cd eva # Install all dependencies (uv automatically creates a virtual environment) From 6d771db15259996e20ae06bcd69274aca24d7a91 Mon Sep 17 00:00:00 2001 From: Katrina Date: Wed, 8 Apr 2026 12:19:15 -0400 Subject: [PATCH 25/25] make model required for all services --- src/eva/assistant/pipeline/services.py | 22 +++++--- src/eva/models/config.py | 11 ++-- tests/unit/models/test_config_models.py | 67 +++++++++++++------------ 3 files changed, 53 insertions(+), 47 deletions(-) diff --git a/src/eva/assistant/pipeline/services.py b/src/eva/assistant/pipeline/services.py index 1b735824..c8ee3eff 100644 --- a/src/eva/assistant/pipeline/services.py +++ b/src/eva/assistant/pipeline/services.py @@ -412,19 +412,22 @@ def create_realtime_llm_service( if model_lower.startswith("openai"): session_properties = get_openai_session_properties(system_prompt, params, pipecat_tools) if audit_log is not None: - logger.info( - f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params.get('model')}" - ) + logger.info(f"Using InstrumentedRealtimeLLMService for audit log interception: openai: {params['model']}") return InstrumentedRealtimeLLMService( - model=params.get("model"), + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, + ), audit_log=audit_log, api_key=params["api_key"], - session_properties=session_properties, ) return OpenAIRealtimeLLMService( api_key=params["api_key"], - session_properties=session_properties, + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, + ), ) elif model_lower.startswith("azure") or model_lower.startswith("gpt-realtime"): # @@ -438,17 +441,21 @@ def create_realtime_llm_service( if audit_log is not None: logger.info("Using InstrumentedRealtimeLLMService for audit log interception") service = InstrumentedRealtimeLLMService( - 
model=params.get("model"), audit_log=audit_log, api_key=params["api_key"], base_url=url, session_properties=session_properties, + settings=OpenAIRealtimeLLMService.Settings( + model=params["model"], + session_properties=session_properties, + ), ) InstrumentedRealtimeLLMService._connect = override__connect # azure realtime connect return service return OpenAIRealtimeLLMService( api_key=params["api_key"], + model=params["model"], base_url=url, session_properties=session_properties, ) @@ -461,6 +468,7 @@ def create_realtime_llm_service( temperature=0.3, max_duration=datetime.timedelta(minutes=6), voice=params.get("voice", "03e20d03-35e4-43c4-bb18-9b18a2cd3086"), + model=params["model"], ), one_shot_selected_tools=pipecat_tools, ) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index b69a2beb..e08783bd 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -82,9 +82,9 @@ class PipelineConfig(BaseModel): def pipeline_parts(self) -> dict[str, str]: """Component names for this pipeline.""" return { - "stt": _param_alias(self.stt_params) or self.stt, + "stt": _param_alias(self.stt_params), "llm": self.llm, - "tts": _param_alias(self.tts_params) or self.tts, + "tts": _param_alias(self.tts_params), } @model_validator(mode="before") @@ -307,9 +307,6 @@ class RunConfig(BaseSettings): "EVA_METRICS_TO_RUN": "EVA_METRICS", } - # Providers that manage their own model/key resolution (e.g. 
WebSocket-based) - _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} - # Maps *_params field names to their provider field for env override logic _PARAMS_TO_PROVIDER: ClassVar[dict[str, str]] = { "stt_params": "stt", @@ -503,7 +500,7 @@ def _check_companion_services(self) -> "RunConfig": self._validate_service_params("audio_llm", self.model.audio_llm, required_keys, self.model.audio_llm_params) elif isinstance(self.model, SpeechToSpeechConfig): - # api_key is required, some s2s services don't require model - self._validate_service_params("S2S", self.model.s2s, ["api_key"], self.model.s2s_params) + # api_key and model are both required for s2s services, same as the other services + self._validate_service_params("S2S", self.model.s2s, required_keys, self.model.s2s_params) return self @model_validator(mode="after") @@ -518,8 +515,6 @@ def _validate_service_params( cls, service: str, provider: str, required_keys: list[str], params: dict[str, Any] ) -> None: """Validate that STT/TTS params contain required keys.""" - if provider.lower() in cls._SKIP_PARAMS_VALIDATION: - return missing = [key for key in required_keys if key not in params] if missing: missing_str = " and ".join(f'"{k}"' for k in missing) diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 99424e22..50f22c73 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -57,7 +57,7 @@ } _S2S_ENV = _EVA_MODEL_LIST_ENV | { "EVA_MODEL__S2S": "gpt-realtime-mini", - "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": ""}), + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": "", "model": "test"}), } @@ -77,6 +77,12 @@ def _config( return RunConfig(_env_file=env_file, _cli_parse_args=cli_args, **kwargs) +def _load_json_into_runconfig(json_str: str) -> RunConfig: + """Load RunConfig from JSON with isolated environment (no real env vars).""" + with patch.dict(os.environ, {}, clear=True): + return RunConfig.model_validate_json(json_str) + + class TestRunConfig: def test_create_minimal_config(self): """Test
creating a minimal RunConfig.""" @@ -193,7 +199,7 @@ def test_apply_env_overrides(self): """Redacted secrets are restored from a live config for both model and model_list.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) # Everything is redacted after round-trip assert loaded.model.stt_params["api_key"] == "***" @@ -214,11 +220,11 @@ def test_apply_env_overrides(self): assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" - def test_apply_env_overrides_provider_mismatch(self): - """Restoring secrets fails if the STT/TTS provider changed.""" + def test_apply_env_overrides_provider_mismatch(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS provider changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) live = _config( env_vars=_BASE_ENV @@ -227,8 +233,11 @@ def test_apply_env_overrides_provider_mismatch(self): "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "whisper-1"}), } ) - with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): + with caplog.at_level("WARNING", logger="eva.models.config"): loaded.apply_env_overrides(live) + assert "Provider mismatch for stt_params" in caplog.text + assert "deepgram" in caplog.text + assert "openai_whisper" in caplog.text def test_apply_env_overrides_alias_mismatch(self): """Restoring secrets fails if the alias changed.""" @@ -239,7 +248,7 @@ def test_apply_env_overrides_alias_mismatch(self): } ) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) live = 
_config( env_vars=_BASE_ENV @@ -257,7 +266,7 @@ def test_apply_env_overrides_model_mismatch_warns(self, caplog): """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) with caplog.at_level("WARNING", logger="eva.models.config"): @@ -273,7 +282,7 @@ def test_apply_env_overrides_url_from_env(self, caplog): } config = _config(env_vars=saved_env) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) # Live env has a different url live_env = _BASE_ENV | { @@ -292,7 +301,7 @@ def test_apply_env_overrides_url_added_from_env(self): """Url from live env is added even if the saved config didn't have one.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) live_env = _BASE_ENV | { "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), @@ -306,7 +315,7 @@ def test_apply_env_overrides_llm_deployment_mismatch(self): """Restoring secrets fails if a saved LLM deployment is missing from the live model_list.""" config = _config(env_vars=_BASE_ENV) dumped_json = config.model_dump_json() - loaded = RunConfig.model_validate_json(dumped_json) + loaded = _load_json_into_runconfig(dumped_json) # Live config has a different model_list (only one deployment, different name) different_model_list = [ @@ -447,20 +456,6 @@ def test_missing_stt_tts_params(self): } ) - def test_nvidia_stt_skips_params_validation(self): - """NVIDIA STT skips api_key/model validation (uses url-based config).""" - config = _config( - 
env_vars=_EVA_MODEL_LIST_ENV - | { - "EVA_MODEL__LLM": "gpt-5.2", - "EVA_MODEL__STT": "nvidia", - "EVA_MODEL__TTS": "cartesia", - "EVA_MODEL__STT_PARAMS": json.dumps({"url": "ws://localhost:8000"}), - "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic"}), - } - ) - assert config.model.stt == "nvidia" - class TestDefaults: """Verify default values match expectations.""" @@ -547,14 +542,14 @@ class TestDeprecatedEnvVars: _S2S_ENV, "REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"api_key": "k"}, + {"api_key": "k", "model": "model"}, lambda c: c.model.s2s_params, ), ( _S2S_ENV, "EVA_MODEL__REALTIME_MODEL_PARAMS", "EVA_MODEL__S2S_PARAMS", - {"api_key": "k"}, + {"api_key": "k", "model": "model"}, lambda c: c.model.s2s_params, ), ( @@ -816,7 +811,7 @@ def test_s2s_config_from_env(self): env_vars=_EVA_MODEL_LIST_ENV | { "EVA_MODEL__S2S": "gpt-realtime-mini", - "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": ""}), + "EVA_MODEL__S2S_PARAMS": json.dumps({"api_key": "", "model": "gpt-realtime-mini"}), } ) assert isinstance(config.model, SpeechToSpeechConfig) @@ -826,17 +821,25 @@ def test_s2s_config_from_cli(self): """--s2s-model selects SpeechToSpeechConfig.""" config = _config( env_vars=_EVA_MODEL_LIST_ENV, - cli_args=["--model.s2s", "gemini_live", "--model.s2s-params", '{"api_key": "test-key"}'], + cli_args=[ + "--model.s2s", + "gemini_live", + "--model.s2s-params", + '{"api_key": "test-key", "model": "gemini_live"}', + ], ) assert isinstance(config.model, SpeechToSpeechConfig) assert config.model.s2s == "gemini_live" - assert config.model.s2s_params == {"api_key": "test-key"} + assert config.model.s2s_params == {"api_key": "test-key", "model": "gemini_live"} def test_s2s_config_with_params(self): """S2S params are passed through.""" config = _config( env_vars=_EVA_MODEL_LIST_ENV, - model={"s2s": "gpt-realtime-mini", "s2s_params": {"voice": "alloy", "api_key": "key_1"}}, + model={ + "s2s": "gpt-realtime-mini", + "s2s_params": {"voice": "alloy", 
"api_key": "key_1", "model": "gpt-realtime-mini"}, + }, ) assert isinstance(config.model, SpeechToSpeechConfig) - assert config.model.s2s_params == {"voice": "alloy", "api_key": "key_1"} + assert config.model.s2s_params == {"voice": "alloy", "api_key": "key_1", "model": "gpt-realtime-mini"}