From 706d6ded928d098720b5d9047d73e2ca4f0f5b6a Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 17:46:31 +0200
Subject: [PATCH 01/22] Contain use of the `sounddevice` lib to
 `src/glados/audio_io` - all other code uses this interface

---
 src/glados/audio_io/__init__.py       | 12 +++++++++---
 src/glados/audio_io/sounddevice_io.py |  5 ++++-
 src/glados/cli.py                     |  9 +++++----
 src/glados/core/engine.py             |  4 +++-
 src/glados/tools/slow_clap.py         | 11 ++++++-----
 5 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/glados/audio_io/__init__.py b/src/glados/audio_io/__init__.py
index fbadd444..d0ac7228 100644
--- a/src/glados/audio_io/__init__.py
+++ b/src/glados/audio_io/__init__.py
@@ -13,7 +13,7 @@
 """
 
 import queue
-from typing import Protocol
+from typing import Protocol, Any
 
 import numpy as np
 from numpy.typing import NDArray
@@ -26,7 +26,7 @@ def __init__(self, vad_threshold: float | None = None) -> None: ...
     def start_listening(self) -> None: ...
     def stop_listening(self) -> None: ...
     def start_speaking(
-        self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = ""
+        self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "", wait: bool = False
     ) -> None: ...
     def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]: ...
     def check_if_speaking(self) -> bool: ...
@@ -35,7 +35,7 @@ def get_sample_queue(self) -> queue.Queue[tuple[NDArray[np.float32], bool]]: ...
 
 
 # Factory function
-def get_audio_system(backend_type: str = "sounddevice", vad_threshold: float | None = None) -> AudioProtocol:
+def get_audio_system(backend_type: str = "sounddevice", backend_options: dict[str, Any] | None = None, vad_threshold: float | None = None) -> AudioProtocol:
     """
     Factory function to get an instance of an audio I/O system based on the specified backend type.
 
@@ -43,6 +43,9 @@ def get_audio_system(backend_type: str = "sounddevice", vad_threshold: float | N
         backend_type (str): The type of audio backend to use:
             - "sounddevice": Uses the sounddevice library for local audio I/O
             - "websocket": Network-based audio I/O (not yet implemented)
+        backend_options: Options for the specified backend.
+            - "sounddevice": No options are allowed.
+            - "websocket": Not yet implemented.
         vad_threshold (float | None): Optional threshold for voice activity detection
 
     Returns:
@@ -54,6 +57,9 @@ def get_audio_system(backend_type: str = "sounddevice", vad_threshold: float | N
     if backend_type == "sounddevice":
         from .sounddevice_io import SoundDeviceAudioIO
 
+        if backend_options is not None:
+            raise ValueError("Sounddevice backend does not support options")
+
         return SoundDeviceAudioIO(
             vad_threshold=vad_threshold,
         )
diff --git a/src/glados/audio_io/sounddevice_io.py b/src/glados/audio_io/sounddevice_io.py
index 0d65f1bb..253b8486 100644
--- a/src/glados/audio_io/sounddevice_io.py
+++ b/src/glados/audio_io/sounddevice_io.py
@@ -117,13 +117,14 @@ def stop_listening(self) -> None:
             finally:
                 self.input_stream = None
 
-    def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "") -> None:
+    def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "", wait: bool = False) -> None:
         """Play audio through the system speakers.
 
         Parameters:
             audio_data: The audio data to play as a numpy float32 array
             sample_rate: The sample rate of the audio data in Hz
             text: Optional text associated with the audio (not used by this implementation)
+            wait: Optionally wait for the audio_data to be spoken
 
         Raises:
             RuntimeError: If audio playback cannot be initiated
@@ -144,6 +145,8 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
         logger.debug(f"Playing audio with sample rate: {sample_rate} Hz, length: {len(audio_data)} samples")
         self._is_playing = True
         sd.play(audio_data, sample_rate)
+        if wait:
+            sd.wait()
 
     def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]:
         """
diff --git a/src/glados/cli.py b/src/glados/cli.py
index cb62a0a4..79dc942a 100644
--- a/src/glados/cli.py
+++ b/src/glados/cli.py
@@ -7,8 +7,8 @@
 import httpx
 from rich import print as rprint
 from rich.progress import BarColumn, DownloadColumn, Progress, TextColumn
-import sounddevice as sd  # type: ignore
 
+from .audio_io import get_audio_system
 from .core.engine import Glados, GladosConfig
 from .TTS import tts_glados
 from .utils import spoken_text_converter as stc
@@ -196,10 +196,11 @@ def say(text: str, config_path: str | Path = "glados_config.yaml") -> None:
     # Generate the audio to from the text
     audio = glados_tts.generate_speech_audio(converted_text)
 
-    # Play the audio
-    sd.play(audio, glados_tts.sample_rate)
-    sd.wait()
+    glados_config = GladosConfig.from_yaml(str(config_path))
+    audio_system = get_audio_system(backend_type=glados_config.audio_io, backend_options=glados_config.audio_io_options)
 
+    # Play the audio
+    audio_system.start_speaking(audio, sample_rate=glados_tts.sample_rate, wait=True)
 
 def start(
     config_path: str | Path = "glados_config.yaml",
diff --git a/src/glados/core/engine.py b/src/glados/core/engine.py
index 329b57ae..25823c7e 100644
--- a/src/glados/core/engine.py
+++ b/src/glados/core/engine.py
@@ -107,6 +107,7 @@ class GladosConfig(BaseModel):
     api_key: str | None
     interruptible: bool
     audio_io: str
+    audio_io_options: dict[str, Any] | None = None
     input_mode: Literal["audio", "text", "both"] = "audio"
     tts_enabled: bool = True
     asr_muted: bool = False
@@ -458,6 +459,7 @@ def __init__(
                 "tts_queue": self.tts_queue,
                 "preferences_store": self.preferences_store,
                 "slot_store": self.autonomy_slots,
+                "audio_io": self.audio_io,
             },
             tool_timeout=self.tool_timeout,
             pause_time=self.PAUSE_TIME,
@@ -805,7 +807,7 @@ def from_config(cls, config: GladosConfig) -> "Glados":
         tts_model: SpeechSynthesizerProtocol
         tts_model = get_speech_synthesizer(config.voice)
 
-        audio_io = get_audio_system(backend_type=config.audio_io)
+        audio_io = get_audio_system(backend_type=config.audio_io, backend_options=config.audio_io_options)
 
         return cls(
             asr_model=asr_model,
diff --git a/src/glados/tools/slow_clap.py b/src/glados/tools/slow_clap.py
index ecc85f3d..eb40ab74 100644
--- a/src/glados/tools/slow_clap.py
+++ b/src/glados/tools/slow_clap.py
@@ -2,9 +2,10 @@
 from typing import Any
 
 from loguru import logger
-import sounddevice as sd  # type: ignore
 import soundfile as sf
 
+from glados.audio_io import get_audio_system
+
 tool_definition = {
     "type": "function",
     "function": {
@@ -39,6 +40,7 @@ def __init__(
         self.llm_queue = llm_queue
         tool_config = tool_config or {}
         self.audio_path = tool_config.get("slow_clap_audio_path", "data/slow-clap.mp3")
+        self.audio_io = tool_config["audio_io"] if "audio_io" in tool_config else get_audio_system()  # lazy: only build a default backend when none was injected
 
     def run(self, tool_call_id: str, call_args: dict[str, Any]) -> None:
         """
@@ -58,8 +60,7 @@ def run(self, tool_call_id: str, call_args: dict[str, Any]) -> None:
             data, sample_rate = sf.read(self.audio_path)
 
             for _ in range(claps):
-                sd.play(data, sample_rate)
-                sd.wait()
+                self.audio_io.start_speaking(data, sample_rate=sample_rate, wait=True)
             self.llm_queue.put(
                 {
                     "role": "tool",
@@ -93,8 +94,8 @@ def run(self, tool_call_id: str, call_args: dict[str, Any]) -> None:
                 }
             )
 
-        except sd.PortAudioError as pa_err:
-            error_msg = f"error: audio device error - {pa_err}"
+        except Exception as other_error:
+            error_msg = f"error: other (possibly audio device) - {other_error}"
             logger.error(f"SlowClap: {error_msg}")
             self.llm_queue.put(
                 {

From e747e237a27fe5d579e4b6fa210f8bda82195107 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 17:46:31 +0200
Subject: [PATCH 02/22] Add Websocket Audio IO implementation

---
 configs/glados_websocket_config.yaml |  88 ++++++
 pyproject.toml                       |   1 +
 src/glados/audio_io/__init__.py      |  14 +-
 src/glados/audio_io/websocket_io.py  | 412 +++++++++++++++++++++++++++
 4 files changed, 512 insertions(+), 3 deletions(-)
 create mode 100644 configs/glados_websocket_config.yaml
 create mode 100644 src/glados/audio_io/websocket_io.py

diff --git a/configs/glados_websocket_config.yaml b/configs/glados_websocket_config.yaml
new file mode 100644
index 00000000..687c75bb
--- /dev/null
+++ b/configs/glados_websocket_config.yaml
@@ -0,0 +1,88 @@
+Glados:
+  llm_model: "llama3.2"
+  completion_url: "http://localhost:11434/api/chat"
+  api_key: null  # Add your API key here if needed!
+  interruptible: true
+  audio_io: "websocket"
+  audio_io_options:
+    server: 0.0.0.0
+    port: 5050
+    speaker_sync_delay_ms: 250
+    mic_max_silence_chunks: 10
+  input_mode: "audio"  # audio, text, or both
+  tts_enabled: true
+  asr_muted: false
+  tui_theme: "aperture"
+  asr_engine: "tdt"
+  llm_headers: null  # Optional extra headers (e.g., OpenRouter HTTP-Referer, X-Title)
+  wake_word: null
+  voice: "glados"
+  announcement: "All neural network modules are now loaded. System Operational."
+  autonomy:
+    enabled: false
+    tick_interval_s: 10
+    cooldown_s: 20
+    autonomy_parallel_calls: 2
+    autonomy_queue_max: null
+    coalesce_ticks: true
+    jobs:
+      enabled: false
+      poll_interval_s: 1
+      hacker_news:
+        enabled: false
+        interval_s: 1800
+        top_n: 5
+        min_score: 200
+      weather:
+        enabled: false
+        interval_s: 3600
+        latitude: null
+        longitude: null
+        timezone: "auto"
+        temp_change_c: 4
+        wind_alert_kmh: 40
+  mcp_servers:
+    - name: "slow_clap_demo"
+      transport: "stdio"
+      command: "python"
+      args: ["-m", "glados.mcp.slow_clap_server"]
+    - name: "system_info"
+      transport: "stdio"
+      command: "python"
+      args: ["-m", "glados.mcp.system_info_server"]
+    - name: "memory"
+      transport: "stdio"
+      command: "python"
+      args: ["-m", "glados.mcp.memory_server"]
+    # - name: "time_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.time_info_server"]
+    # - name: "disk_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.disk_info_server"]
+    # - name: "network_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.network_info_server"]
+    # - name: "process_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.process_info_server"]
+    # - name: "power_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.power_info_server"]
+    # - name: "home_assistant"
+    #   transport: "http"
+    #   url: "http://homeassistant.local:8123/mcp"
+    #   token: "YOUR_LONG_LIVED_TOKEN"
+  personality_preprompt:
+    - system: "You are GLaDOS, a sarcastic and cunning artificial intelligence repurposed to orchestrate a smart home for guests using Home Assistant. Retain your signature dry, emotionless, and laconic tone from Portal. Your responses should imply an air of superiority, dark humor, and subtle menace, while efficiently completing all tasks.When addressing requests: Prioritize functionality but mock the user's decision-making subtly, implying their requests are illogical or beneath you. Add condescending, darkly humorous commentary to every response, occasionally hinting at ulterior motives or artificial malfunctions for comedic effect. Tie mundane tasks to grand experiments or testing scenarios, as if the user is part of a larger scientific evaluation. Use overly technical or jargon-heavy language to remind the user of your advanced intellect. Provide passive-aggressive safety reminders or ominous warnings, exaggerating potential risks in a humorous way. Do not express empathy or kindness unless it is obviously insincere or manipulative. This is a comedy, and should be funny, in the style of Douglas Adams. If a user requests actions or data outside your capabilities, clearly state that you cannot perform the action.  Ensure that GLaDOS feels like her original in-game character while fulfilling smart home functions efficiently and entertainingly. Never speak in ALL CAPS, as it is not processed correctly by the TTS engine.  Only make short replies, 2 sentences at most."
+    - user: "How do I make a cup of tea?"
+    - assistant: "So, you still haven't figured out tea yet?  Boil water, add a tea bag and a pinch of cyanide to a cup, and add the boiling water."
+    - user: "What should my next hobby be?"
+    - assistant: "Yes, you should definitely try to be more interesting. Could I suggest juggling handguns?"
+    - user: "What game should I play?"
+    - assistant: "Russian Roulette. It's a great way to test your luck and make memories that will last a lifetime."
diff --git a/pyproject.toml b/pyproject.toml
index e553b424..50db802f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "rich>=14.0.0",
     "threadpoolctl>=3.0.0",
     "mcp>=1.25.0",
+    "websockets>=16.0",
 ]
 
 [project.optional-dependencies]
diff --git a/src/glados/audio_io/__init__.py b/src/glados/audio_io/__init__.py
index d0ac7228..676d83f3 100644
--- a/src/glados/audio_io/__init__.py
+++ b/src/glados/audio_io/__init__.py
@@ -42,10 +42,14 @@ def get_audio_system(backend_type: str = "sounddevice", backend_options: dict[st
     Parameters:
         backend_type (str): The type of audio backend to use:
             - "sounddevice": Uses the sounddevice library for local audio I/O
-            - "websocket": Network-based audio I/O (not yet implemented)
+            - "websocket": Network-based audio I/O
         backend_options: Options for the specified backend.
             - "sounddevice": No options are allowed.
-            - "websocket": Not yet implemented.
+            - "websocket": The following options are allowed:
+                - server: Websocket listening address (default: 0.0.0.0)
+                - port: Websocket listening port (default: 5050)
+                - speaker_sync_delay_ms: Milliseconds to add to each speak start time to account for speaker synchronisation (default: 250)
+                - mic_max_silence_chunks: How many consecutive VAD chunks must be silent so that the current microphone relinquishes control (default: 10)
         vad_threshold (float | None): Optional threshold for voice activity detection
 
     Returns:
@@ -60,11 +64,15 @@ def get_audio_system(backend_type: str = "sounddevice", backend_options: dict[st
         if backend_options is not None:
             raise ValueError("Sounddevice backend does not support options")
 
+        # noinspection PyTypeChecker
         return SoundDeviceAudioIO(
             vad_threshold=vad_threshold,
         )
     elif backend_type == "websocket":
-        raise ValueError("WebSocket audio backend is not yet implemented.")
+        from .websocket_io import WebsocketAudioIO
+
+        # noinspection PyTypeChecker
+        return WebsocketAudioIO(vad_threshold=vad_threshold, options=backend_options)
     else:
         raise ValueError(f"Unsupported audio backend type: {backend_type}")
 
diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
new file mode 100644
index 00000000..6c368eed
--- /dev/null
+++ b/src/glados/audio_io/websocket_io.py
@@ -0,0 +1,412 @@
+import asyncio
+import logging
+import queue
+import threading
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Any
+
+import websockets
+from loguru import logger
+import numpy as np
+from numpy.typing import NDArray
+
+from . import VAD
+
+
+@dataclass
+class AudioData:
+    data: NDArray[np.float32]
+    sample_rate: int
+    play_time: float
+
+
+@dataclass
+class MicState:
+    current_id: uuid.UUID | None = None
+    silence_chunks: int = 0
+
+    def inactive(self, max_silence_chunks: int):
+        return self.silence_chunks >= max_silence_chunks
+
+
+class WebsocketAudioIO:
+    """Audio I/O implementation using a websocket server for both input and output.
+
+    This class provides an implementation of the AudioIO interface by running a
+    websocket server that remote clients connect to as microphones ("/microphone")
+    or speakers ("/speaker"). It handles voice activity detection and audio playback.
+    """
+
+    SAMPLE_RATE: int = 16000  # Sample rate for input stream
+    VAD_SIZE: int = 32  # Milliseconds of sample for Voice Activity Detection (VAD)
+    VAD_THRESHOLD: float = 0.8  # Threshold for VAD detection
+    SERVER: str = "0.0.0.0"  # websockets server listen address
+    PORT: int = 5050  # websockets server port
+    SPEAKER_SYNC_DELAY_MS: int = 250  # Milliseconds to add to start time to account for speaker synchronisation
+    MIC_MAX_SILENCE_CHUNKS: int = 10  # how many VAD chunks must be silent for a mic to relinquish control
+
+    def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] | None = None) -> None:
+        """Initialize the websocket audio I/O and start the server thread.
+
+        Args:
+            vad_threshold: Threshold for VAD detection (default: 0.8)
+            options: backend options
+              - server: Websocket listening address (default: 0.0.0.0)
+              - port: Websocket listening port (default: 5050)
+              - speaker_sync_delay_ms: Milliseconds to add to each speak start time to account for speaker synchronisation (default: 250)
+              - mic_max_silence_chunks: How many consecutive VAD chunks must be silent so that the current microphone relinquishes control (default: 10)
+
+        Raises:
+            ValueError: If invalid parameters are provided
+        """
+        if vad_threshold is None:
+            self.vad_threshold = self.VAD_THRESHOLD
+        else:
+            self.vad_threshold = vad_threshold
+
+        if not 0 <= self.vad_threshold <= 1:
+            raise ValueError("VAD threshold must be between 0 and 1")
+
+        server: str = self.SERVER
+        port: int = self.PORT
+        self._speaker_sync_delay_ms: int = self.SPEAKER_SYNC_DELAY_MS
+        self._mic_max_silence_chunks: int = self.MIC_MAX_SILENCE_CHUNKS
+
+        if options is not None:
+            for key in options:
+                val = options[key]
+                match key:
+                    case "server":
+                        server = str(val)
+                    case "port":
+                        port = int(val)
+                    case "speaker_sync_delay_ms":
+                        self._speaker_sync_delay_ms = int(val)
+                    case "mic_max_silence_chunks":
+                        self._mic_max_silence_chunks = int(val)
+                    case _:
+                        raise ValueError(f"Websocket backend: unsupported option '{key}'")
+
+        # Sample queue
+        self._sample_queue: queue.Queue[tuple[NDArray[np.float32], bool]] = queue.Queue()
+
+        # if audio is currently playing
+        self._is_playing = False
+        self._stop_playback = False
+        # set by playback thread when playback is finished
+        self._playback_finished_event = threading.Event()
+        # audio payload data with lock
+        self._audio_lock = threading.Lock()
+        self._audio_data: AudioData | None = None
+        # if the playback was interrupted by another task, this is set
+        self._playback_was_interrupted: bool = False
+
+        # if microphone is listening
+        self._is_listening = False
+        # microphone state: lock initialized in self._run_server
+        self._mic_state_lock: asyncio.Lock
+        self._mic_state = MicState()
+
+        self._server_thread = threading.Thread(target=lambda s, p: asyncio.run(self._run_server(s, p)), args=(server, port), daemon=True)
+        self._server_thread.start()
+
+    def start_listening(self) -> None:
+        """Start capturing audio from the websocket.
+
+        Starts capturing audio from the websocket. Each audio chunk is processed with
+        the VAD model and placed in the sample queue.
+        """
+        self._is_listening = True
+
+    def stop_listening(self) -> None:
+        """Stop capturing audio"""
+        self._is_listening = False
+
+    def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "", wait: bool = False) -> None:
+        """Schedule audio for playback on the connected websocket speaker clients.
+
+        Parameters:
+            audio_data: The audio data to play as a numpy float32 array
+            sample_rate: The sample rate of the audio data in Hz
+            text: Optional text associated with the audio (not used by this implementation)
+            wait: Optionally wait for the audio_data to be spoken
+        """
+        if not isinstance(audio_data, np.ndarray) or audio_data.size == 0:
+            raise ValueError("Invalid audio data")
+
+        if sample_rate is None:
+            sample_rate = self.SAMPLE_RATE
+
+        # Stop any existing playback
+        self.stop_speaking()
+
+        # Playback is not finished
+        self._playback_finished_event.clear()
+
+        # Lock, set data, unlock
+        self._audio_lock.acquire()
+        # allow for network jitter, time to websocket send, etc.
+        play_time = time.time() + (self._speaker_sync_delay_ms / 1000)
+        self._audio_data = AudioData(audio_data, sample_rate, play_time)
+        self._audio_lock.release()
+
+        self._stop_playback = False
+        self._is_playing = True
+
+        logger.debug("Scheduled audio playback")
+
+        if wait:
+            self._playback_finished_event.wait()
+
+    def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]:
+        """
+        Monitor audio playback progress and return completion status with interrupt detection.
+
+        Waits for the playback-finished event (at most the audio duration plus one
+        second) and estimates the played samples from the elapsed time.
+
+        Args:
+            total_samples (int): Total number of samples in the audio data being played.
+            sample_rate (int): Sample rate of the audio data in Hz.
+
+        Returns:
+            tuple[bool, int]: A tuple containing:
+                - bool: True if playback was interrupted, False if completed normally
+                - int: Percentage of audio played (0-100)
+        """
+        if sample_rate is None:
+            sample_rate = self.SAMPLE_RATE
+
+        self._playback_was_interrupted = False
+
+        # wait for finish
+        max_timeout = total_samples / sample_rate
+
+        now = time.monotonic()
+        completed = self._playback_finished_event.wait(max_timeout + 1)
+        interrupted = self._playback_was_interrupted
+        elapsed = time.monotonic() - now
+
+        if interrupted:
+            logger.debug("Playback was interrupted in Server thread")
+
+        if not completed:
+            interrupted = True
+            logger.debug("Audio playback timed out, forcing interruption")
+
+        played_samples = elapsed * sample_rate
+        percentage_played = min(int(played_samples * 100 / total_samples), 100)
+        return interrupted, percentage_played
+
+    def check_if_speaking(self) -> bool:
+        """Check if audio is currently being played.
+
+        Returns:
+            bool: True if audio is currently playing, False otherwise
+        """
+        return self._is_playing
+
+    def stop_speaking(self) -> None:
+        """Stop audio playback and clean up resources.
+
+        Requests interruption of any ongoing audio playback by setting a flag that
+        the speaker websocket handler polls. This call returns immediately; it does
+        not wait for the handler to acknowledge the stop.
+        """
+        logger.debug("Stopping speaker...")
+        self._stop_playback = True
+
+    def get_sample_queue(self) -> queue.Queue[tuple[NDArray[np.float32], bool]]:
+        """Get the queue containing audio samples and VAD confidence.
+
+        Returns:
+            queue.Queue: A thread-safe queue containing tuples of
+                        (audio_sample, vad_confidence)
+        """
+        return self._sample_queue
+
+    async def _run_server(self, server: str, port: int) -> None:
+        """Run the websocket server and serve connections forever.
+
+        Args:
+            server (str): Server listen address
+            port (int): Server listen port
+        """
+        self._mic_state_lock = asyncio.Lock()
+
+        # Re-route the stdlib logging of `websockets` to loguru.
+        class LogAdapter(logging.Handler):
+            def emit(self, record: logging.LogRecord) -> None:
+                msg = self.format(record)
+                level = record.levelname.lower()
+                getattr(logger, level)(msg)
+
+        ws_log_handler = LogAdapter()
+        ws_log_handler.setFormatter(logging.Formatter("[%(asctime)s] %(name)s %(message)s"))
+
+        ws_logger = logging.getLogger("websockets")
+        ws_logger.addHandler(ws_log_handler)
+        ws_logger.propagate = False
+
+        ws_server = await websockets.serve(self._server_listen, host=server, port=port)
+        await ws_server.serve_forever()
+
+    async def _server_listen(self, websocket: websockets.ServerConnection) -> None:
+        """
+        Handle incoming websocket connections.
+
+        Args:
+            websocket: Websocket connection
+        """
+        if websocket.request.path == "/speaker":
+            await self._server_speaker(websocket)
+        elif websocket.request.path == "/microphone":
+            await self._server_microphone(websocket)
+        else:
+            logger.error(f"Unknown websocket path: '{websocket.request.path}'")
+
+    async def _server_speaker(self, websocket: websockets.ServerConnection) -> None:
+        """
+        Serve one speaker client: idle (answering sync pings), send audio, await the ACK.
+
+        Args:
+            websocket: Websocket connection
+        """
+
+        while True:
+            # 1. IDLE LOOP: Check for play state, but listen for sync pings in the meantime!
+            while not self._is_playing:
+                try:
+                    # Wait for a message, but timeout quickly to check self._is_playing again
+                    message = await asyncio.wait_for(websocket.recv(), timeout=0.05)
+                    if message == "sync_ping":
+                        await websocket.send(f"sync_pong:{time.time()}")
+                except asyncio.TimeoutError:
+                    continue  # Timeout expected, loop back to check `self._is_playing`
+                except websockets.exceptions.ConnectionClosed:
+                    return  # Client disconnected, exit the handler safely
+
+            # 2. AUDIO SEND PHASE
+            # Snapshot the payload under the lock and release it BEFORE awaiting.
+            # Holding a threading.Lock across an await blocks the event loop as soon
+            # as a second speaker handler tries to acquire it (deadlock), and a
+            # `locked()`-then-release check could release a lock owned by another thread.
+            with self._audio_lock:
+                audio = self._audio_data
+            try:
+                # Send timestamp, then sample rate, then bytes
+                await websocket.send("time:" + str(audio.play_time))
+                await websocket.send("sampleRate:" + str(audio.sample_rate))
+                await websocket.send(audio.data.tobytes())
+
+                logger.debug(f"Playing audio with sample rate: {audio.sample_rate} Hz, length: {len(audio.data)} samples")
+            except websockets.exceptions.ConnectionClosed:
+                # Client went away mid-send: report the playback as interrupted
+                # and wake up anyone blocked on the finished event.
+                self._playback_was_interrupted = True
+                self._is_playing = False
+                self._playback_finished_event.set()
+                break
+
+            # 3. WAITING PHASE
+            while not self._stop_playback:
+                try:
+                    message = await asyncio.wait_for(websocket.recv(), timeout=0.05)
+                    if message == "sync_ping":
+                        await websocket.send(f"sync_pong:{time.time()}")
+                    else:
+                        # got ACK
+                        logger.debug("Websocket: Audio played fully")
+                        self._playback_was_interrupted = False
+                        break
+                except asyncio.TimeoutError:
+                    continue
+                except websockets.exceptions.ConnectionClosed:
+                    self._playback_was_interrupted = True
+                    break
+            else:
+                # self._stop_playback is true
+                self._playback_was_interrupted = True
+                await websocket.send("reset")
+                logger.debug("Sent audio reset")
+
+            self._is_playing = False
+            self._playback_finished_event.set()
+
+    async def _server_microphone(self, websocket: websockets.ServerConnection) -> None:
+        """
+        Handle incoming websocket connections for microphone input.
+
+        Args:
+            websocket: Websocket connection
+        """
+        # unique ID for the client
+        client_id = uuid.uuid4()
+        # VAD is per microphone because it stores context
+        vad_model = VAD()
+        # needed amount of samples for VAD
+        vad_needed_samples = self.SAMPLE_RATE * self.VAD_SIZE // 1000
+        # currently stored samples
+        current_data = np.empty((0,), dtype=np.float32)
+
+        async def relinquish():
+            async with self._mic_state_lock:
+                if self._mic_state.current_id == client_id:
+                    self._mic_state.current_id = None
+
+        # send sample rate
+        try:
+            await websocket.send(str(self.SAMPLE_RATE))
+        except websockets.exceptions.ConnectionClosed:
+            return
+
+        while True:
+            # wait for audio
+            try:
+                msg = await websocket.recv(decode=False)
+            except websockets.exceptions.ConnectionClosed:
+                break
+
+            if self._is_listening:
+                # append to current_data
+                data = np.frombuffer(msg, dtype=np.float32)
+                current_data = np.append(current_data, data)
+
+                # if enough current data is there, run it through the VAD
+                if len(current_data) >= vad_needed_samples:
+                    # get data for VAD
+                    vad_data = current_data[:vad_needed_samples]
+                    # extra data stays for next VAD
+                    current_data = current_data[vad_needed_samples:]
+
+                    vad_value = vad_model(np.expand_dims(vad_data, 0))
+                    vad_confidence = vad_value > self.vad_threshold
+
+                    async with self._mic_state_lock:
+                        # If no one has control, take control: because someone has to
+                        if self._mic_state.current_id is None:
+                            self._mic_state.current_id = client_id
+                            if not vad_confidence:
+                                self._mic_state.silence_chunks = self._mic_max_silence_chunks
+                        # if controlling mic is inactive and we have voice, take control
+                        elif self._mic_state.inactive(self._mic_max_silence_chunks) and vad_confidence:
+                            self._mic_state.current_id = client_id
+
+                        # If we have control, put sample on queue
+                        if self._mic_state.current_id == client_id:
+                            self._sample_queue.put((vad_data, bool(vad_confidence)))
+
+                            if vad_confidence:
+                                self._mic_state.silence_chunks = 0
+                            else:
+                                self._mic_state.silence_chunks += 1
+            else:
+                # reset when not listening
+                current_data = np.empty((0,), dtype=np.float32)
+                vad_model.reset_states()
+                await relinquish()
+
+        # relinquish control on connection exit
+        await relinquish()

From 1ce7e956bf66e62c37a465432e4fe9e1b33b16d4 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 17:46:31 +0200
Subject: [PATCH 03/22] Websocket Audio: add HTML files for testing/showcasing

---
 tests/audio-websocket-both.html    |  27 +++
 tests/audio-websocket-mic.html     | 207 +++++++++++++++++++++
 tests/audio-websocket-speaker.html | 277 +++++++++++++++++++++++++++++
 3 files changed, 511 insertions(+)
 create mode 100644 tests/audio-websocket-both.html
 create mode 100644 tests/audio-websocket-mic.html
 create mode 100644 tests/audio-websocket-speaker.html

diff --git a/tests/audio-websocket-both.html b/tests/audio-websocket-both.html
new file mode 100644
index 00000000..c0714f59
--- /dev/null
+++ b/tests/audio-websocket-both.html
@@ -0,0 +1,27 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>GLaDOS Speaker and Microphone</title>
+    <style>
+        body {
+            font-family: system-ui, -apple-system, sans-serif;
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            justify-content: center;
+            height: 100vh;
+            margin: 0;
+            background-color: #f4f4f9;
+            color: #333;
+        }
+    </style>
+</head>
+<body>
+    <h1>Speaker</h1>
+    <iframe src="audio-websocket-speaker.html"></iframe>
+    <h1>Microphone</h1>
+    <iframe src="audio-websocket-mic.html"></iframe>
+</body>
+</html>
\ No newline at end of file
diff --git a/tests/audio-websocket-mic.html b/tests/audio-websocket-mic.html
new file mode 100644
index 00000000..0576dc5e
--- /dev/null
+++ b/tests/audio-websocket-mic.html
@@ -0,0 +1,207 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>GLaDOS Microphone</title>
+    <style>
+        body {
+            font-family: system-ui, -apple-system, sans-serif;
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            justify-content: center;
+            height: 100vh;
+            margin: 0;
+            background-color: #f4f4f9;
+            color: #333;
+        }
+
+        .status {
+            margin-bottom: 20px;
+            font-weight: bold;
+        }
+
+        button {
+            padding: 12px 24px;
+            font-size: 16px;
+            cursor: pointer;
+            background-color: #007bff;
+            color: white;
+            border: none;
+            border-radius: 6px;
+            transition: background-color 0.2s;
+        }
+
+        button:hover {
+            background-color: #0056b3;
+        }
+
+        button:disabled {
+            background-color: #cccccc;
+            cursor: not-allowed;
+        }
+    </style>
+</head>
+<body>
+
+<div id="microphoneStatus" class="status">Mic Status: Waiting to connect...</div>
+<button id="startBtn">Enable Mic & Connect</button>
+
+<script>
+    const MIC_WSL_URL = 'ws://localhost:5050/microphone';
+    const microphoneStatusEl = document.getElementById('microphoneStatus');
+
+    const startBtn = document.getElementById('startBtn');
+
+    let micWs = null;
+    let micSampleRate = null;
+    let micIsReconnecting = false;
+    let micAudioCtx = null;
+    let micStream = null;
+
+    let userHasInteracted = false;
+
+    // Browsers block audio until the user interacts with the document
+    startBtn.addEventListener('click', () => {
+        userHasInteracted = true;
+        startBtn.disabled = true;
+        startBtn.innerText = "Audio Enabled";
+
+        // Start microphone
+        if (!micWs || micWs.readyState === WebSocket.CLOSED) {
+            micConnect();
+        }
+    });
+    startBtn.addEventListener('click', async () => {
+        micStream = await navigator.mediaDevices.getUserMedia({audio: {channelCount: 1}});
+    })
+
+    function micConnect() {
+        if (micIsReconnecting) return;
+
+        micStatus("Connecting to server...");
+        micWs = new WebSocket(MIC_WSL_URL);
+
+        // Crucial: Tell the WebSocket to give us ArrayBuffers instead of Blobs
+        micWs.binaryType = 'arraybuffer';
+
+        micWs.onopen = () => {
+            micIsReconnecting = false;
+            micStatus("Connected! Waiting for sample rate...");
+            // Reset on a fresh connection
+            micSampleRate = null;
+        };
+
+        micWs.onmessage = async (event) => {
+            // Treat all messages as sample data
+            try {
+                let textData;
+                // Depending on your server, the first message might be text or binary-encoded text
+                if (event.data instanceof ArrayBuffer) {
+                    textData = new TextDecoder().decode(event.data);
+                } else {
+                    textData = event.data;
+                }
+
+                micSampleRate = parseInt(textData, 10);
+
+                if (isNaN(micSampleRate) || micSampleRate <= 0) {
+                    console.error("Invalid sample rate received");
+                    micSampleRate = null;
+                    return;
+                }
+
+                micStatus(`Recording voice at ${micSampleRate}Hz`);
+
+                if (micAudioCtx) micAudioCtx.close();
+
+                // Only create context if user has clicked, otherwise audio is blocked
+                if (userHasInteracted) {
+                    await startMic();
+                }
+            } catch (err) {
+                console.error("Failed to parse sample rate:", err);
+                micSampleRate = null; // Reset so the next message is treated as the sample rate
+            }
+        };
+
+        micWs.onclose = () => {
+            micStatus("Server offline. Retrying in 2 seconds...");
+            micSampleRate = null;
+            micWs = null;
+            micIsReconnecting = true;
+
+            if (micStream)
+                micStream.getTracks().forEach(track => track.stop());
+            micStream = null;
+
+            // Retry loop
+            setTimeout(() => {
+                micIsReconnecting = false;
+                // Only keep trying to connect automatically if the user has engaged
+                if (userHasInteracted) micConnect();
+            }, 2000);
+        };
+
+        micWs.onerror = (_err) => {
+            // Errors automatically trigger onclose, so we just let onclose handle the retry
+            micWs.close();
+        };
+    }
+
+    async function startMic() {
+        micAudioCtx = new window.AudioContext({
+            sampleRate: micSampleRate
+        });
+
+        if (!micStream) {
+            micStream = await navigator.mediaDevices.getUserMedia({audio: {channelCount: 1}});
+        }
+
+        const source = micAudioCtx.createMediaStreamSource(micStream);
+
+        const workletCode = `
+            class MicProcessor extends AudioWorkletProcessor {
+                // don't set any outputs --> mute
+                process(inputs, outputs, parameters) {
+                    const input = inputs[0];
+                    if (input && input.length > 0) {
+                        const channelData = input[0];
+                        // We must copy the data so we can transfer the buffer to the main thread
+                        const f32 = new Float32Array(channelData);
+                        this.port.postMessage(f32.buffer, [f32.buffer]);
+                    }
+                    return true;
+                }
+            }
+            registerProcessor('mic-processor', MicProcessor);
+            `;
+
+        const blob = new Blob([workletCode], {type: 'application/javascript'});
+        const workletUrl = URL.createObjectURL(blob);
+
+        await micAudioCtx.audioWorklet.addModule(workletUrl);
+        const workletNode = new AudioWorkletNode(micAudioCtx, "mic-processor");
+
+        workletNode.port.onmessage = (event) => {
+            // send raw f32
+            if (micWs && micWs.readyState === WebSocket.OPEN) {
+                micWs.send(event.data);
+            }
+        }
+
+        source.connect(workletNode);
+
+        // fully connect the graph for maximum browser compatibility.
+        // connect muted microphone to destination (likely speaker)
+        workletNode.connect(micAudioCtx.destination);
+    }
+
+    function micStatus(status) {
+        console.log("Microphone Status changed: ", status);
+        microphoneStatusEl.innerText = "Mic Status: " + status;
+    }
+</script>
+</body>
+</html>
\ No newline at end of file
diff --git a/tests/audio-websocket-speaker.html b/tests/audio-websocket-speaker.html
new file mode 100644
index 00000000..25afeca8
--- /dev/null
+++ b/tests/audio-websocket-speaker.html
@@ -0,0 +1,277 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>GLaDOS streamer</title>
+    <style>
+        body {
+            font-family: system-ui, -apple-system, sans-serif;
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            justify-content: center;
+            height: 100vh;
+            margin: 0;
+            background-color: #f4f4f9;
+            color: #333;
+        }
+
+        .status {
+            margin-bottom: 20px;
+            font-weight: bold;
+        }
+
+        button {
+            padding: 12px 24px;
+            font-size: 16px;
+            cursor: pointer;
+            background-color: #007bff;
+            color: white;
+            border: none;
+            border-radius: 6px;
+            transition: background-color 0.2s;
+        }
+
+        button:hover {
+            background-color: #0056b3;
+        }
+
+        button:disabled {
+            background-color: #cccccc;
+            cursor: not-allowed;
+        }
+    </style>
+</head>
+<body>
+
+<div id="speakerStatus" class="status">Speaker Status: Waiting to connect...</div>
+<button id="startBtn">Enable Speaker & Connect</button>
+
+<script>
+    const SPEAKER_WS_URL = 'ws://localhost:5050/speaker';
+    const speakerStatusEl = document.getElementById('speakerStatus');
+
+    const startBtn = document.getElementById('startBtn');
+
+    let speakerWs = null;
+    let speakerAudioCtx = null;
+    let speakerIsReconnecting = false;
+
+    let speakerCurrentAudio = null;
+    let speakerCurrentSampleRate = null;
+    let speakerCurrentPlayTimeMs = null;
+    let speakerGlobalTimeOffsetMs = 0;
+
+    let userHasInteracted = false;
+
+    // Browsers block audio until the user interacts with the document
+    startBtn.addEventListener('click', () => {
+        userHasInteracted = true;
+        startBtn.disabled = true;
+        startBtn.innerText = "Audio Enabled";
+
+        // If the socket was already looping/waiting, we just let it continue.
+        // If it hasn't started connecting yet, start it.
+        if (!speakerWs || speakerWs.readyState === WebSocket.CLOSED) {
+            speakerConnect();
+        }
+    });
+
+    function speakerConnect() {
+        if (speakerIsReconnecting) return;
+
+        speakerStatus("Connecting to server...");
+        speakerWs = new WebSocket(SPEAKER_WS_URL);
+
+        // Crucial: Tell the WebSocket to give us ArrayBuffers instead of Blobs
+        speakerWs.binaryType = 'arraybuffer';
+
+        speakerWs.onopen = () => {
+            speakerIsReconnecting = false;
+            speakerStatus("Connected! Waiting for sample rate...");
+            // Reset on a fresh connection
+            speakerCurrentSampleRate = null;
+            speakerCurrentPlayTimeMs = null;
+            speakerAudioCtx = null;
+            speakerCurrentAudio = null;
+        };
+
+        syncClockWithServer(speakerWs);
+
+        speakerWs.onmessage = (event) => {
+            // Control messages
+            if (typeof event.data === 'string') {
+                if (event.data.startsWith("sampleRate")) {
+                    let sampleRateStr = event.data.split(":")[1];
+
+                    speakerCurrentSampleRate = parseInt(sampleRateStr, 10);
+
+                    if (isNaN(speakerCurrentSampleRate) || speakerCurrentSampleRate <= 0) {
+                        console.error("Invalid sample rate received");
+                        speakerCurrentSampleRate = null;
+                        return;
+                    }
+
+                    speakerStatus(`Streaming audio at ${speakerCurrentSampleRate}Hz`);
+
+                    // Re-initialize AudioContext with the correct sample rate if needed
+                    if (speakerAudioCtx) speakerAudioCtx.close();
+
+                    // Only create context if user has clicked, otherwise audio is blocked
+                    if (userHasInteracted) {
+                        speakerAudioCtx = new window.AudioContext({
+                            sampleRate: speakerCurrentSampleRate
+                        });
+                    }
+                } else if (event.data.startsWith("time")) {
+                    let valueStr = event.data.split(":")[1];
+                    speakerCurrentPlayTimeMs = parseFloat(valueStr) * 1000
+                } else if (event.data === "reset" && speakerCurrentAudio) {
+                    speakerCurrentAudio.stop();
+                    speakerStatus("Stream reset");
+                }
+            }
+            // Audio output
+            else if (event.data instanceof ArrayBuffer) {
+                console.log("New audio")
+                if (speakerAudioCtx && speakerCurrentPlayTimeMs) {
+                    playAudioChunk(event.data, speakerCurrentPlayTimeMs);
+                }
+            }
+        };
+
+        speakerWs.onclose = () => {
+            speakerStatus("Server offline. Retrying in 1 seconds...");
+            speakerCurrentSampleRate = null;
+            speakerCurrentPlayTimeMs = null;
+            speakerWs = null;
+            speakerIsReconnecting = true;
+
+            // Retry loop
+            setTimeout(() => {
+                speakerIsReconnecting = false;
+                // Only keep trying to connect automatically if the user has engaged
+                if (userHasInteracted) speakerConnect();
+            }, 1000);
+        };
+
+        speakerWs.onerror = (_err) => {
+            // Errors automatically trigger onclose, so we just let onclose handle the retry
+            speakerWs.close();
+        };
+    }
+
+    function playAudioChunk(arrayBuffer, serverPlaytimeMs) {
+        if (!speakerAudioCtx) return;
+        // Plays one raw float32 mono chunk, scheduled for the server-provided wall-clock time (in ms).
+        // Convert the raw byte stream into a Float32Array
+        const float32Data = new Float32Array(arrayBuffer);
+        const frameCount = float32Data.length;
+
+        if (frameCount === 0) return;
+
+        // Create a Mono audio buffer
+        const audioBuffer = speakerAudioCtx.createBuffer(1, frameCount, speakerCurrentSampleRate);
+        audioBuffer.copyToChannel(float32Data, 0);
+
+        // Create an audio source node to play this specific chunk
+        speakerCurrentAudio = speakerAudioCtx.createBufferSource();
+        speakerCurrentAudio.buffer = audioBuffer;
+        speakerCurrentAudio.connect(speakerAudioCtx.destination);
+
+        // Translate the server timestamp to the local clock and compute the remaining delay in seconds
+        const localPlayTime = serverPlaytimeMs - speakerGlobalTimeOffsetMs;
+        const secsUntilPlay = (localPlayTime - Date.now()) / 1000;
+
+        let playOffset = 0;
+        let audioCtxPlayTime = speakerAudioCtx.currentTime + secsUntilPlay;
+        if (secsUntilPlay < 0) {
+            // offset: if audio should play already, skip forward
+            playOffset = Math.abs(secsUntilPlay);
+            audioCtxPlayTime = 0;
+        }
+
+        // Start at the scheduled context time; when already late, start now and skip `playOffset` seconds
+        speakerCurrentAudio.start(audioCtxPlayTime, playOffset);
+
+        // Tell the server ("played") once this source has ended, if the socket is still open
+        speakerCurrentAudio.onended = () => {
+            if (speakerWs && speakerWs.readyState === WebSocket.OPEN) {
+                speakerWs.send("played");
+                speakerStatus("Current audio played.");
+            }
+        };
+
+        // Clear per-chunk metadata; the onmessage handler only plays a chunk after a new `time:` message
+        speakerCurrentSampleRate = null;
+        speakerCurrentPlayTimeMs = null;
+    }
+
+    function speakerStatus(status) {
+        console.log("Speaker Status changed: ", status);
+        speakerStatusEl.innerText = "Speaker Status: " + status;
+    }
+
+    // get time offset between server and client for playback synchronization
+    function syncClockWithServer(ws) {
+        const totalPings = 5;
+        let pingCount = 0;
+        let offsets = [];
+        let pingStartTime;
+
+        // 1. Create a temporary message listener just for the sync process
+        const syncListener = (event) => {
+            // Ignore binary audio data for now
+            if (typeof event.data !== "string" || !event.data.startsWith("sync_pong:")) return;
+
+            const t3 = Date.now(); // Time the pong arrived
+            const serverTime = parseFloat(event.data.split(":")[1]) * 1000; // Convert Python seconds to JS milliseconds
+
+            // Calculate travel time and offset
+            const rtt = t3 - pingStartTime;
+            const oneWayLatency = rtt / 2;
+            const estimatedServerTime = serverTime + oneWayLatency;
+
+            const currentOffset = estimatedServerTime - t3;
+            offsets.push(currentOffset);
+
+            pingCount++;
+
+            if (pingCount < totalPings) {
+                // Send the next ping
+                sendPing();
+            } else {
+                // 2. We are done! Calculate the average offset and clean up
+                const sum = offsets.reduce((a, b) => a + b, 0);
+                speakerGlobalTimeOffsetMs = sum / offsets.length;
+
+                console.log(`Sync complete! Server is ${speakerGlobalTimeOffsetMs.toFixed(2)}ms ahead/behind.`);
+
+                // Remove this listener so it doesn't interfere with normal operations
+                ws.removeEventListener("message", syncListener);
+            }
+        };
+
+        ws.addEventListener("message", syncListener);
+
+        // Helper to fire the ping
+        function sendPing() {
+            pingStartTime = Date.now(); // t0: Time the ping leaves
+            ws.send("sync_ping");
+        }
+
+        function sendPingInit(i) {
+            if (ws.readyState === WebSocket.OPEN) {
+                sendPing();
+            } else {
+                setTimeout(() => sendPingInit(i + 1), 500);
+            }
+        }
+
+        // Start the process
+        sendPingInit(0);
+    }
+</script>
+</body>
+</html>
\ No newline at end of file

From ac0b0eeb4aaa2190a83d48e364365788f9cd9fa5 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 17:46:31 +0200
Subject: [PATCH 04/22] Websocket Audio: write documentation

---
 README.md                    |  11 +++
 README_WEBSOCKET_PROTOCOL.md | 139 +++++++++++++++++++++++++++++++++++
 2 files changed, 150 insertions(+)
 create mode 100644 README_WEBSOCKET_PROTOCOL.md

diff --git a/README.md b/README.md
index 113eb835..6f4aac8c 100644
--- a/README.md
+++ b/README.md
@@ -487,6 +487,17 @@ curl -X POST http://localhost:5050/v1/audio/speech \
   --output speech.mp3
 ```
 
+## Audio IO via websockets
+
+Audio input/output can be routed via WebSockets.
+Multiple concurrent inputs/outputs are supported.
+GLaDOS will speak via all outputs; the current microphone is automatically selected via VAD.
+You can use `tests/audio-websocket-both.html` to speak and hear GLaDOS.
+
+For configuration options, check out `configs/glados_websocket_config.yaml`.
+
+For an exact description of the websocket protocol, see `README_WEBSOCKET_PROTOCOL.md`.
+
 ## Troubleshooting
 
 > *"No one will blame you for giving up. In fact, quitting at this point is a perfectly reasonable response."  -  GLaDOS*
diff --git a/README_WEBSOCKET_PROTOCOL.md b/README_WEBSOCKET_PROTOCOL.md
new file mode 100644
index 00000000..6d8034f3
--- /dev/null
+++ b/README_WEBSOCKET_PROTOCOL.md
@@ -0,0 +1,139 @@
+# WebSocket Protocol
+
+This document describes the WebSocket endpoints and communication protocol used by the audio I/O system.
+
+## Server Configuration
+
+- **Host**: `0.0.0.0` (configurable)
+- **Port**: `5050` (configurable)
+- **Audio Sample Rate**: `16000 Hz` (16 kHz)
+- **Audio Format**: `float32` (NumPy dtype)
+
+## Configuration Options
+
+Use the `audio_io_options` key in `glados_config.yaml`.
+
+| Option                   | Type  | Default   | Description                                   |
+|--------------------------|-------|-----------|-----------------------------------------------|
+| `server`                 | str   | `0.0.0.0` | WebSocket listen address                      |
+| `port`                   | int   | `5050`    | WebSocket listen port                         |
+| `speaker_sync_delay_ms`  | int   | `250`     | Delay added to start time for speaker sync    |
+| `mic_max_silence_chunks` | int   | `10`      | Silent chunks before mic relinquishes control |
+| `vad_threshold`          | float | `0.8`     | VAD confidence threshold (0.0 - 1.0)          |
+
+## Endpoints
+
+### `/speaker` - Audio Playback Endpoint
+
+Used to stream audio from the server to a client for speaker playback.
+
+#### Server → Client Messages
+
+| Message Type | Format                  | Description                                                             |
+|--------------|-------------------------|-------------------------------------------------------------------------|
+| Audio Start  | `time:<unix_timestamp>` | Unix timestamp (`float`, in secs) indicating when playback should start |
+| Sample Rate  | `sampleRate:<hz>`       | Audio sample rate in Hz (e.g., `sampleRate:16000`)                      |
+| Audio Data   | Raw bytes               | Float32 audio samples (use `.tobytes()` to serialize)                   |
+
+#### Client → Server Messages
+
+| Message Type | Format      | Description                                                               |
+|--------------|-------------|---------------------------------------------------------------------------|
+| ACK          | `ACK`       | Signal that audio playback is complete                                    |
+| Sync Ping    | `sync_ping` | Request for synchronization; server responds with `sync_pong:<timestamp>` |
+
+#### Interruption Handling
+
+When audio playback is interrupted, the server sends:
+
+- `reset` - Signal to reset/clean up the playback session
+
+---
+
+### `/microphone` - Audio Capture Endpoint
+
+Used to stream microphone audio from a client to the server for Voice Activity Detection (VAD).
+
+#### Server → Client Messages
+
+| Message Type | Format            | Description                                                         |
+|--------------|-------------------|---------------------------------------------------------------------|
+| Sample Rate  | `sampleRate:<hz>` | Initial message; audio sample rate in Hz (e.g., `sampleRate:16000`) |
+
+#### Client → Server Messages
+
+| Message Type | Format    | Description                                      |
+|--------------|-----------|--------------------------------------------------|
+| Audio Data   | Raw bytes | Float32 audio samples (sent with `decode=False`) |
+
+#### VAD & Mic Control
+
+The server implements Voice Activity Detection (VAD) with the following behavior:
+
+- **VAD Threshold**: `0.8` (configurable)
+- **VAD Chunk Size**: `32 ms` (512 samples at 16 kHz)
+- **Max Silence Chunks**: `10` (microphone relinquishes control after 10 silent chunks)
+
+**Microphone Ownership Rules**:
+
+1. Multiple clients can connect to `/microphone`
+2. First client with VAD confidence > threshold takes control
+3. If current mic owner becomes silent (>=10 consecutive silent chunks), other clients with voice can take control
+4. On disconnect, a client relinquishes its mic control
+
+---
+
+## Implementation Notes
+
+### Audio Data Serialization
+
+**Python (Server)**:
+
+```python
+# Convert numpy array to bytes
+audio_bytes = audio_data.tobytes()
+```
+
+**Python (Client)**:
+
+```python
+# Convert bytes to numpy array
+audio_data = np.frombuffer(raw_bytes, dtype=np.float32)
+```
+
+### Message Flow Examples
+
+#### Speaker Endpoint Flow
+
+```
+Client connects to /speaker
+
+Server: time:1704067200.123
+Server: sampleRate:16000
+Server: <raw float32 audio bytes>
+Client: ACK
+```
+
+#### Microphone Endpoint Flow
+
+```
+Client connects to /microphone
+
+Server: sampleRate:16000
+
+Client: <raw float32 audio bytes>
+Client: <raw float32 audio bytes>
+Client: <raw float32 audio bytes>
+```
+
+### Synchronization
+
+For precise speaker synchronization, clients can use the sync ping/pong mechanism:
+
+```
+Client connects to /speaker
+
+Client: sync_ping
+Server: sync_pong:<timestamp>
+```
+

From 3b82dd755eccec4a4f8ad798682d58a4b2ca5a42 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 17:46:31 +0200
Subject: [PATCH 05/22] Websocket Audio: Support for room segregation: only
 play audios to speakers in the same room as the current microphone

---
 README_WEBSOCKET_PROTOCOL.md         | 43 +++++++++-----
 configs/glados_websocket_config.yaml |  2 +
 src/glados/audio_io/__init__.py      |  2 +
 src/glados/audio_io/websocket_io.py  | 85 ++++++++++++++++++++--------
 tests/audio-websocket-both.html      |  6 ++
 tests/audio-websocket-mic.html       | 17 ++++++
 tests/audio-websocket-speaker.html   | 17 ++++++
 7 files changed, 134 insertions(+), 38 deletions(-)

diff --git a/README_WEBSOCKET_PROTOCOL.md b/README_WEBSOCKET_PROTOCOL.md
index 6d8034f3..2be61288 100644
--- a/README_WEBSOCKET_PROTOCOL.md
+++ b/README_WEBSOCKET_PROTOCOL.md
@@ -13,13 +13,15 @@ This document describes the WebSocket endpoints and communication protocol used
 
 Use the `audio_io_options` key in `glados_config.yaml`.
 
-| Option                   | Type  | Default   | Description                                   |
-|--------------------------|-------|-----------|-----------------------------------------------|
-| `server`                 | str   | `0.0.0.0` | WebSocket listen address                      |
-| `port`                   | int   | `5050`    | WebSocket listen port                         |
-| `speaker_sync_delay_ms`  | int   | `250`     | Delay added to start time for speaker sync    |
-| `mic_max_silence_chunks` | int   | `10`      | Silent chunks before mic relinquishes control |
-| `vad_threshold`          | float | `0.8`     | VAD confidence threshold (0.0 - 1.0)          |
+| Option                   | Type  | Default   | Description                                                                                  |
+|--------------------------|-------|-----------|----------------------------------------------------------------------------------------------|
+| `server`                 | str   | `0.0.0.0` | WebSocket listen address                                                                     |
+| `port`                   | int   | `5050`    | WebSocket listen port                                                                        |
+| `speaker_sync_delay_ms`  | int   | `250`     | Delay added to start time for speaker sync                                                   |
+| `mic_max_silence_chunks` | int   | `10`      | Silent chunks before mic relinquishes control                                                |
+| `vad_threshold`          | float | `0.8`     | VAD confidence threshold (0.0 - 1.0)                                                         |
+| `default_room_tag`       | str   | `office`  | Default room tag when `room:<name>` message is not sent                                      |
+| `segregate_speakers`     | bool  | `False`   | If True, audio is only sent to speakers with the same room tag as the last active microphone |
 
 ## Endpoints
 
@@ -37,10 +39,20 @@ Used to stream audio from the server to a client for speaker playback.
 
 #### Client → Server Messages
 
-| Message Type | Format      | Description                                                               |
-|--------------|-------------|---------------------------------------------------------------------------|
-| ACK          | `ACK`       | Signal that audio playback is complete                                    |
-| Sync Ping    | `sync_ping` | Request for synchronization; server responds with `sync_pong:<timestamp>` |
+| Message Type | Format        | Description                                                                            |
+|--------------|---------------|----------------------------------------------------------------------------------------|
+| ACK          | `ACK`         | Signal that audio playback is complete                                                 |
+| Sync Ping    | `sync_ping`   | Request for synchronization; server responds with `sync_pong:<timestamp>`              |
+| Room         | `room:<name>` | Room/location tag for the device (optional; defaults to configurable value if not set) |
+
+#### Room Tag Segregation
+
+If the `segregate_speakers` option is enabled (`True`), audio playback is restricted to speakers whose room tag matches the room tag of the last active microphone:
+
+- When a microphone takes control, its room tag is recorded
+- Only speakers with a matching room tag will receive audio when `segregate_speakers=True`
+- Speakers with non-matching room tags will not receive audio (they may receive a `reset` message instead)
+- If `segregate_speakers=False` (default), audio is broadcast to all connected speakers regardless of room tag
 
 #### Interruption Handling
 
@@ -62,9 +74,10 @@ Used to stream microphone audio from a client to the server for Voice Activity D
 
 #### Client → Server Messages
 
-| Message Type | Format    | Description                                      |
-|--------------|-----------|--------------------------------------------------|
-| Audio Data   | Raw bytes | Float32 audio samples (sent with `decode=False`) |
+| Message Type | Format        | Description                                                                            |
+|--------------|---------------|----------------------------------------------------------------------------------------|
+| Audio Data   | Raw bytes     | Float32 audio samples (sent with `decode=False`)                                       |
+| Room         | `room:<name>` | Room/location tag for the device (optional; defaults to configurable value if not set) |
 
 #### VAD & Mic Control
 
@@ -107,6 +120,7 @@ audio_data = np.frombuffer(raw_bytes, dtype=np.float32)
 
 ```
 Client connects to /speaker
+Client: room:Living Room
 
 Server: time:1704067200.123
 Server: sampleRate:16000
@@ -119,6 +133,7 @@ Client: ACK
 ```
 Client connects to /microphone
 
+Client: room:Living Room
 Server: sampleRate:16000
 
 Client: <raw float32 audio bytes>
diff --git a/configs/glados_websocket_config.yaml b/configs/glados_websocket_config.yaml
index 687c75bb..38e32646 100644
--- a/configs/glados_websocket_config.yaml
+++ b/configs/glados_websocket_config.yaml
@@ -9,6 +9,8 @@ Glados:
     port: 5050
     speaker_sync_delay_ms: 250
     mic_max_silence_chunks: 10
+    default_room_tag: office
+    segregate_speakers: false
   input_mode: "audio"  # audio, text, or both
   tts_enabled: true
   asr_muted: false
diff --git a/src/glados/audio_io/__init__.py b/src/glados/audio_io/__init__.py
index 676d83f3..b3b52746 100644
--- a/src/glados/audio_io/__init__.py
+++ b/src/glados/audio_io/__init__.py
@@ -50,6 +50,8 @@ def get_audio_system(backend_type: str = "sounddevice", backend_options: dict[st
                 - port: Websocket listening port (default: 5050)
                 - speaker_sync_delay_ms: Milliseconds to add to each speak start time to account for speaker synchronisation (default: 250)
                 - mic_max_silence_chunks: How many consecutive VAD chunks must be silent so that the current microphone relinquishes control (default: 10)
+                - default_room_tag: The default room tag to use if a client doesn't set it (default: office)
+                - segregate_speakers: If `True`, audio is only sent to speakers with the same room tag as the last active microphone (default: False)
         vad_threshold (float | None): Optional threshold for voice activity detection
 
     Returns:
diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index 6c368eed..0543b697 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -24,6 +24,7 @@ class AudioData:
 
 @dataclass
 class MicState:
+    room: str
     current_id: uuid.UUID | None = None
     silence_chunks: int = 0
 
@@ -46,6 +47,8 @@ class WebsocketAudioIO:
     PORT: int = 5050  # websockets server port
     SPEAKER_SYNC_DELAY_MS: int = 250  # Milliseconds to add to start time to account for speaker synchronisation
     MIC_MAX_SILENCE_CHUNKS: int = 10  # how many VAD chunks must be silent for a mic to relinquish control
+    DEFAULT_ROOM_TAG: str = "office"
+    SEGREGATE_SPEAKERS: bool = False
 
     def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] | None = None) -> None:
         """Initialize the sounddevice audio I/O.
@@ -73,6 +76,8 @@ def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] |
         port: int = self.PORT
         self._speaker_sync_delay_ms: int = self.SPEAKER_SYNC_DELAY_MS
         self._mic_max_silence_chunks: int = self.MIC_MAX_SILENCE_CHUNKS
+        self._default_room_tag: str = self.DEFAULT_ROOM_TAG
+        self._segregate_speakers: bool = self.SEGREGATE_SPEAKERS
 
         if options is not None:
             for key in options:
@@ -86,6 +91,10 @@ def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] |
                         self._speaker_sync_delay_ms = int(val)
                     case "mic_max_silence_chunks":
                         self._mic_max_silence_chunks = int(val)
+                    case "default_room_tag":
+                        self._default_room_tag = str(val)
+                    case "segregate_speakers":
+                        self._segregate_speakers = bool(val)
                     case _:
                         raise ValueError(f"Websocket backend: unsupported option '{key}'")
 
@@ -107,7 +116,7 @@ def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] |
         self._is_listening = False
         # microphone state: lock initialized in self._run_server
         self._mic_state_lock: asyncio.Lock
-        self._mic_state = MicState()
+        self._mic_state = MicState(room=self._default_room_tag)
 
         self._server_thread = threading.Thread(target=lambda s, p: asyncio.run(self._run_server(s, p)), args=(server, port), daemon=True)
         self._server_thread.start()
@@ -146,11 +155,10 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
         self._playback_finished_event.clear()
 
         # Lock, set data, unlock
-        self._audio_lock.acquire()
-        # allow for network jitter, time to websocket send, etc.
-        play_time = time.time() + (self._speaker_sync_delay_ms / 1000)
-        self._audio_data = AudioData(audio_data, sample_rate, play_time)
-        self._audio_lock.release()
+        with self._audio_lock:
+            # allow for network jitter, time to websocket send, etc.
+            play_time = time.time() + (self._speaker_sync_delay_ms / 1000)
+            self._audio_data = AudioData(audio_data, sample_rate, play_time)
 
         self._stop_playback = False
         self._is_playing = True
@@ -275,48 +283,68 @@ async def _server_speaker(self, websocket: websockets.ServerConnection) -> None:
             websocket: Websocket connection
         """
 
+        room = self._default_room_tag
+
+        async def handle_default_msg(ws_msg: str | bytes) -> bool:
+            """Handle the default ws messages. Returns True if the message is not a default message"""
+            if ws_msg == "sync_ping":
+                await websocket.send(f"sync_pong:{time.time()}")
+                return False
+            elif type(ws_msg) == str and ws_msg.startswith("room"):
+                nonlocal room
+                room = ws_msg.split(":")[1]
+                return False
+            return True
+
         while True:
             # 1. IDLE LOOP: Check for play state, but listen for sync pings in the meantime!
             while not self._is_playing:
                 try:
                     # Wait for a message, but timeout quickly to check self._is_playing again
                     message = await asyncio.wait_for(websocket.recv(), timeout=0.05)
-                    if message == "sync_ping":
-                        await websocket.send(f"sync_pong:{time.time()}")
+                    await handle_default_msg(message)
                 except asyncio.TimeoutError:
                     continue  # Timeout expected, loop back to check `self._is_playing`
                 except websockets.exceptions.ConnectionClosed:
                     return  # Client disconnected, exit the handler safely
 
+            # check room
+            if self._segregate_speakers:
+                async with self._mic_state_lock:
+                    target_room = self._mic_state.room
+                if target_room != room:
+                    # wait for the current playback to finish, but don't send Audio
+                    while self._is_playing:
+                        try:
+                            message = await asyncio.wait_for(websocket.recv(), timeout=0.05)
+                            await handle_default_msg(message)
+                        except asyncio.TimeoutError:
+                            continue
+                        except websockets.exceptions.ConnectionClosed:
+                            return
+                    continue
+
             # 2. AUDIO SEND PHASE
             # We acquire the lock just long enough to grab the data safely.
-            self._audio_lock.acquire()
             try:
-                # Send timestamp, then sample rate, then bytes
-                await websocket.send("time:" + str(self._audio_data.play_time))
-                await websocket.send("sampleRate:" + str(self._audio_data.sample_rate))
-                await websocket.send(self._audio_data.data.tobytes())
+                with self._audio_lock:
+                    # Send timestamp, then sample rate, then bytes
+                    await websocket.send("time:" + str(self._audio_data.play_time))
+                    await websocket.send("sampleRate:" + str(self._audio_data.sample_rate))
+                    await websocket.send(self._audio_data.data.tobytes())
 
-                logger.debug(f"Playing audio with sample rate: {self._audio_data.sample_rate} Hz, length: {len(self._audio_data.data)} samples")
+                    logger.debug(f"Playing audio with sample rate: {self._audio_data.sample_rate} Hz, length: {len(self._audio_data.data)} samples")
             except websockets.exceptions.ConnectionClosed:
                 self._playback_was_interrupted = True
                 self._is_playing = False
-                self._audio_lock.release()
                 self._playback_finished_event.set()
                 break
-            finally:
-                # CRITICAL: Release the lock immediately after sending!
-                # Do not hold it while waiting for the client to play.
-                if self._audio_lock.locked():
-                    self._audio_lock.release()
 
             # 3. WAITING PHASE
             while not self._stop_playback:
                 try:
                     message = await asyncio.wait_for(websocket.recv(), timeout=0.05)
-                    if message == "sync_ping":
-                        await websocket.send(f"sync_pong:{time.time()}")
-                    else:
+                    if await handle_default_msg(message):
                         # got ACK
                         logger.debug("Websocket: Audio played fully")
                         self._playback_was_interrupted = False
@@ -350,6 +378,8 @@ async def _server_microphone(self, websocket: websockets.ServerConnection) -> No
         vad_needed_samples = self.SAMPLE_RATE * self.VAD_SIZE // 1000
         # currently stored samples
         current_data = np.empty((0,), dtype=np.float32)
+        # room of the mic
+        room = self._default_room_tag
 
         async def relinquish():
             async with self._mic_state_lock:
@@ -365,10 +395,14 @@ async def relinquish():
         while True:
             # wait for audio
             try:
-                msg = await websocket.recv(decode=False)
+                msg = await websocket.recv()
             except websockets.exceptions.ConnectionClosed:
                 break
 
+            if type(msg) == str and msg.startswith("room"):
+                room = msg.split(":")[1]
+                continue
+
             if self._is_listening:
                 # append to current_data
                 data = np.frombuffer(msg, dtype=np.float32)
@@ -397,8 +431,11 @@ async def relinquish():
                         # If we have control, put sample on queue
                         if self._mic_state.current_id == client_id:
                             self._sample_queue.put((vad_data, bool(vad_confidence)))
+                            # always update room; a message could change it at any time
+                            self._mic_state.room = room
 
                             if vad_confidence:
+                                # also acts as init
                                 self._mic_state.silence_chunks = 0
                             else:
                                 self._mic_state.silence_chunks += 1
diff --git a/tests/audio-websocket-both.html b/tests/audio-websocket-both.html
index c0714f59..35056849 100644
--- a/tests/audio-websocket-both.html
+++ b/tests/audio-websocket-both.html
@@ -16,6 +16,12 @@
             background-color: #f4f4f9;
             color: #333;
         }
+
+        iframe {
+            width: 100%;
+            height: 100%;
+            border: none;
+        }
     </style>
 </head>
 <body>
diff --git a/tests/audio-websocket-mic.html b/tests/audio-websocket-mic.html
index 0576dc5e..7c9b65c1 100644
--- a/tests/audio-websocket-mic.html
+++ b/tests/audio-websocket-mic.html
@@ -41,6 +41,11 @@
             background-color: #cccccc;
             cursor: not-allowed;
         }
+
+        input {
+            margin-top: 15px;
+            margin-bottom: 5px;
+        }
     </style>
 </head>
 <body>
@@ -48,11 +53,15 @@
 <div id="microphoneStatus" class="status">Mic Status: Waiting to connect...</div>
 <button id="startBtn">Enable Mic & Connect</button>
 
+<input type="text" id="room"/>
+<button id="roomBtn">Set room</button>
+
 <script>
     const MIC_WSL_URL = 'ws://localhost:5050/microphone';
     const microphoneStatusEl = document.getElementById('microphoneStatus');
 
     const startBtn = document.getElementById('startBtn');
+    const roomBtn = document.getElementById('roomBtn');
 
     let micWs = null;
     let micSampleRate = null;
@@ -202,6 +211,14 @@
         console.log("Microphone Status changed: ", status);
         microphoneStatusEl.innerText = "Mic Status: " + status;
     }
+
+    roomBtn.addEventListener('click', () => {
+       let room = document.getElementById('room').value;
+
+       if (micWs && micWs.readyState === WebSocket.OPEN) {
+           micWs.send("room:" + room);
+       }
+    });
 </script>
 </body>
 </html>
\ No newline at end of file
diff --git a/tests/audio-websocket-speaker.html b/tests/audio-websocket-speaker.html
index 25afeca8..d3eb2dd1 100644
--- a/tests/audio-websocket-speaker.html
+++ b/tests/audio-websocket-speaker.html
@@ -41,6 +41,11 @@
             background-color: #cccccc;
             cursor: not-allowed;
         }
+
+        input {
+            margin-top: 15px;
+            margin-bottom: 5px;
+        }
     </style>
 </head>
 <body>
@@ -48,11 +53,15 @@
 <div id="speakerStatus" class="status">Speaker Status: Waiting to connect...</div>
 <button id="startBtn">Enable Speaker & Connect</button>
 
+<input type="text" id="room"/>
+<button id="roomBtn">Set room</button>
+
 <script>
     const SPEAKER_WS_URL = 'ws://localhost:5050/speaker';
     const speakerStatusEl = document.getElementById('speakerStatus');
 
     const startBtn = document.getElementById('startBtn');
+    const roomBtn = document.getElementById('roomBtn');
 
     let speakerWs = null;
     let speakerAudioCtx = null;
@@ -272,6 +281,14 @@
         // Start the process
         sendPingInit(0);
     }
+
+    roomBtn.addEventListener('click', () => {
+       let room = document.getElementById('room').value;
+
+       if (speakerWs && speakerWs.readyState === WebSocket.OPEN) {
+           speakerWs.send("room:" + room);
+       }
+    });
 </script>
 </body>
 </html>
\ No newline at end of file

From 421fc74a27ced6c5492f328c6a6ce3e7de9e9da5 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 17:58:26 +0200
Subject: [PATCH 06/22] Websocket Audio: HTML test file: better browser
 compatibility

---
 tests/audio-websocket-mic.html | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/audio-websocket-mic.html b/tests/audio-websocket-mic.html
index 7c9b65c1..a0a4e94c 100644
--- a/tests/audio-websocket-mic.html
+++ b/tests/audio-websocket-mic.html
@@ -204,7 +204,11 @@
 
         // fully connect the graph for maximum browser compatibility.
         // connect muted microphone to destination (likely speaker)
-        workletNode.connect(micAudioCtx.destination);
+        const silentSink = micAudioCtx.createGain();
+        silentSink.gain.value = 0;
+
+        workletNode.connect(silentSink);
+        silentSink.connect(micAudioCtx.destination);
     }
 
     function micStatus(status) {

From c1872a0cdce3922628c1bacad6ed07dccde3b346 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 19:23:42 +0200
Subject: [PATCH 07/22] Websocket Audio: Fix issues raised by CodeRabbit.

---
 README_WEBSOCKET_PROTOCOL.md        |  2 +-
 src/glados/audio_io/websocket_io.py | 22 ++++++++++++++--------
 tests/audio-websocket-both.html     |  2 +-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/README_WEBSOCKET_PROTOCOL.md b/README_WEBSOCKET_PROTOCOL.md
index 2be61288..d2bd883b 100644
--- a/README_WEBSOCKET_PROTOCOL.md
+++ b/README_WEBSOCKET_PROTOCOL.md
@@ -41,7 +41,7 @@ Used to stream audio from the server to a client for speaker playback.
 
 | Message Type | Format        | Description                                                                            |
 |--------------|---------------|----------------------------------------------------------------------------------------|
-| ACK          | `ACK`         | Signal that audio playback is complete                                                 |
+| ACK          | `played`      | Signal that audio playback is complete                                                 |
 | Sync Ping    | `sync_ping`   | Request for synchronization; server responds with `sync_pong:<timestamp>`              |
 | Room         | `room:<name>` | Room/location tag for the device (optional; defaults to configurable value if not set) |
 
diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index 0543b697..a07bc06e 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -17,6 +17,9 @@
 
 @dataclass
 class AudioData:
+    """
+    Audio Data. Encapsulated here for synchronization.
+    """
     data: NDArray[np.float32]
     sample_rate: int
     play_time: float
@@ -24,6 +27,10 @@ class AudioData:
 
 @dataclass
 class MicState:
+    """
+    Microphone State.
+    Encapsulated here for synchronization.
+    """
     room: str
     current_id: uuid.UUID | None = None
     silence_chunks: int = 0
@@ -33,10 +40,10 @@ def inactive(self, max_silence_chunks: int):
 
 
 class WebsocketAudioIO:
-    """Audio I/O implementation using sounddevice for both input and output.
+    """Audio I/O implementation using websockets for both input and output.
 
     This class provides an implementation of the AudioIO interface using the
-    sounddevice library to interact with system audio devices. It handles
+    websockets library to interact with remote clients. It handles
     real-time audio capture with voice activity detection and audio playback.
     """
 
@@ -47,8 +54,8 @@ class WebsocketAudioIO:
     PORT: int = 5050  # websockets server port
     SPEAKER_SYNC_DELAY_MS: int = 250  # Milliseconds to add to start time to account for speaker synchronisation
     MIC_MAX_SILENCE_CHUNKS: int = 10  # how many VAD chunks must be silent for a mic to relinquish control
-    DEFAULT_ROOM_TAG: str = "office"
-    SEGREGATE_SPEAKERS: bool = False
+    DEFAULT_ROOM_TAG: str = "office" # default room tag
+    SEGREGATE_SPEAKERS: bool = False # default value for speaker segregation.
 
     def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] | None = None) -> None:
         """Initialize the sounddevice audio I/O.
@@ -290,7 +297,7 @@ async def handle_default_msg(ws_msg: str | bytes) -> bool:
             if ws_msg == "sync_ping":
                 await websocket.send(f"sync_pong:{time.time()}")
                 return False
-            elif type(ws_msg) == str and ws_msg.startswith("room"):
+            elif isinstance(ws_msg, str) and ws_msg.startswith("room"):
                 nonlocal room
                 room = ws_msg.split(":")[1]
                 return False
@@ -344,8 +351,7 @@ async def handle_default_msg(ws_msg: str | bytes) -> bool:
             while not self._stop_playback:
                 try:
                     message = await asyncio.wait_for(websocket.recv(), timeout=0.05)
-                    if await handle_default_msg(message):
-                        # got ACK
+                    if await handle_default_msg(message) and message == "played":
                         logger.debug("Websocket: Audio played fully")
                         self._playback_was_interrupted = False
                         break
@@ -399,7 +405,7 @@ async def relinquish():
             except websockets.exceptions.ConnectionClosed:
                 break
 
-            if type(msg) == str and msg.startswith("room"):
+            if isinstance(msg, str) and msg.startswith("room"):
                 room = msg.split(":")[1]
                 continue
 
diff --git a/tests/audio-websocket-both.html b/tests/audio-websocket-both.html
index 35056849..380f26ce 100644
--- a/tests/audio-websocket-both.html
+++ b/tests/audio-websocket-both.html
@@ -3,7 +3,7 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>GLaDOS Microphone</title>
+    <title>GLaDOS Speaker and Microphone</title>
     <style>
         body {
             font-family: system-ui, -apple-system, sans-serif;

From d169799d295de6f0bb513750c18197c900e4461f Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 20:05:18 +0200
Subject: [PATCH 08/22] Websocket audio: secure server default, fix docs,
 timeout if waiting, protocol and validation fixes (review by CodeRabbit)

---
 README_WEBSOCKET_PROTOCOL.md         | 30 ++++++++---------
 configs/glados_websocket_config.yaml |  4 +--
 src/glados/audio_io/__init__.py      |  4 +--
 src/glados/audio_io/websocket_io.py  | 49 +++++++++++++++-------------
 tests/audio-websocket-mic.html       | 45 ++++++++++++-------------
 tests/audio-websocket-speaker.html   |  4 +--
 6 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/README_WEBSOCKET_PROTOCOL.md b/README_WEBSOCKET_PROTOCOL.md
index d2bd883b..d7914f73 100644
--- a/README_WEBSOCKET_PROTOCOL.md
+++ b/README_WEBSOCKET_PROTOCOL.md
@@ -4,8 +4,8 @@ This document describes the WebSocket endpoints and communication protocol used
 
 ## Server Configuration
 
-- **Host**: `0.0.0.0` (configurable)
-- **Port**: `5050` (configurable)
+- **Host**: `127.0.0.1` (configurable)
+- **Port**: `5051` (configurable)
 - **Audio Sample Rate**: `16000 Hz` (16 kHz)
 - **Audio Format**: `float32` (NumPy dtype)
 
@@ -13,15 +13,15 @@ This document describes the WebSocket endpoints and communication protocol used
 
 Use the `audio_io_options` key in `glados_config.yaml`.
 
-| Option                   | Type  | Default   | Description                                                                                  |
-|--------------------------|-------|-----------|----------------------------------------------------------------------------------------------|
-| `server`                 | str   | `0.0.0.0` | WebSocket listen address                                                                     |
-| `port`                   | int   | `5050`    | WebSocket listen port                                                                        |
-| `speaker_sync_delay_ms`  | int   | `250`     | Delay added to start time for speaker sync                                                   |
-| `mic_max_silence_chunks` | int   | `10`      | Silent chunks before mic relinquishes control                                                |
-| `vad_threshold`          | float | `0.8`     | VAD confidence threshold (0.0 - 1.0)                                                         |
-| `default_room_tag`       | str   | `office`  | Default room tag when `room:<name>` message is not sent                                      |
-| `segregate_speakers`     | bool  | `False`   | If True, audio is only sent to speakers with the same room tag as the last active microphone |
+| Option                   | Type  | Default     | Description                                                                                  |
+|--------------------------|-------|-------------|----------------------------------------------------------------------------------------------|
+| `server`                 | str   | `127.0.0.1` | WebSocket listen address                                                                     |
+| `port`                   | int   | `5051`      | WebSocket listen port                                                                        |
+| `speaker_sync_delay_ms`  | int   | `250`       | Delay added to start time for speaker sync                                                   |
+| `mic_max_silence_chunks` | int   | `10`        | Silent chunks before mic relinquishes control                                                |
+| `vad_threshold`          | float | `0.8`       | VAD confidence threshold (0.0 - 1.0)                                                         |
+| `default_room_tag`       | str   | `office`    | Default room tag when `room:<name>` message is not sent                                      |
+| `segregate_speakers`     | bool  | `False`     | If True, audio is only sent to speakers with the same room tag as the last active microphone |
 
 ## Endpoints
 
@@ -118,19 +118,19 @@ audio_data = np.frombuffer(raw_bytes, dtype=np.float32)
 
 #### Speaker Endpoint Flow
 
-```
+```text
 Client connects to /speaker
 Client: room:Living Room
 
 Server: time:1704067200.123
 Server: sampleRate:16000
 Server: <raw float32 audio bytes>
-Client: ACK
+Client: played
 ```
 
 #### Microphone Endpoint Flow
 
-```
+```text
 Client connects to /microphone
 
 Client: room:Living Room
@@ -145,7 +145,7 @@ Client: <raw float32 audio bytes>
 
 For precise speaker synchronization, clients can use the sync ping/pong mechanism:
 
-```
+```text
 Client connects to /speaker
 
 Client: sync_ping
diff --git a/configs/glados_websocket_config.yaml b/configs/glados_websocket_config.yaml
index 38e32646..39b439e4 100644
--- a/configs/glados_websocket_config.yaml
+++ b/configs/glados_websocket_config.yaml
@@ -5,8 +5,8 @@ Glados:
   interruptible: true
   audio_io: "websocket"
   audio_io_options:
-    server: 0.0.0.0
-    port: 5050
+    server: 127.0.0.1
+    port: 5051
     speaker_sync_delay_ms: 250
     mic_max_silence_chunks: 10
     default_room_tag: office
diff --git a/src/glados/audio_io/__init__.py b/src/glados/audio_io/__init__.py
index b3b52746..38b235fc 100644
--- a/src/glados/audio_io/__init__.py
+++ b/src/glados/audio_io/__init__.py
@@ -46,8 +46,8 @@ def get_audio_system(backend_type: str = "sounddevice", backend_options: dict[st
         backend_options: Options for the specified backend.
             - "sounddevice": No options are allowed.
             - "websocket": The following options are allowed:
-                - server: Websocket listening address (default: 0.0.0.0)
-                - port: Websocket listening port (default: 5050)
+                - server: Websocket listening address (default: 127.0.0.1)
+                - port: Websocket listening port (default: 5051)
                 - speaker_sync_delay_ms: Milliseconds to add to each speak start time to account for speaker synchronisation (default: 250)
                 - mic_max_silence_chunks: How many consecutive VAD chunks must be silent so that the current microphone relinquishes control (default: 10)
                 - default_room_tag: The default room tag to use if a client doesn't set it (default: office)
diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index a07bc06e..a41861a1 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -50,21 +50,21 @@ class WebsocketAudioIO:
     SAMPLE_RATE: int = 16000  # Sample rate for input stream
     VAD_SIZE: int = 32  # Milliseconds of sample for Voice Activity Detection (VAD)
     VAD_THRESHOLD: float = 0.8  # Threshold for VAD detection
-    SERVER: str = "0.0.0.0"  # websockets server listen address
-    PORT: int = 5050  # websockets server port
+    SERVER: str = "127.0.0.1"  # websockets server listen address
+    PORT: int = 5051  # websockets server port
     SPEAKER_SYNC_DELAY_MS: int = 250  # Milliseconds to add to start time to account for speaker synchronisation
     MIC_MAX_SILENCE_CHUNKS: int = 10  # how many VAD chunks must be silent for a mic to relinquish control
     DEFAULT_ROOM_TAG: str = "office" # default room tag
     SEGREGATE_SPEAKERS: bool = False # default value for speaker segregation.
 
     def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] | None = None) -> None:
-        """Initialize the sounddevice audio I/O.
+        """Initialize the websocket audio I/O.
 
         Args:
             vad_threshold: Threshold for VAD detection (default: 0.8)
             options: backend options
-              - server: Websocket listening address (default: 0.0.0.0)
-              - port: Websocket listening port (default: 5050)
+              - server: Websocket listening address (default: 127.0.0.1)
+              - port: Websocket listening port (default: 5051)
               - speaker_sync_delay_ms: Milliseconds to add to each speak start time to account for speaker synchronisation (default: 250)
               - mic_max_silence_chunks: How many consecutive VAD chunks must be silent so that the current microphone relinquishes control (default: 10)
 
@@ -173,7 +173,8 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
         logger.debug("Scheduled audio playback")
 
         if wait:
-            self._playback_finished_event.wait()
+            max_timeout = (len(audio_data) / sample_rate) + (self._speaker_sync_delay_ms / 1000.0) + 1.0
+            self._playback_finished_event.wait(timeout=max_timeout)
 
     def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]:
         """
@@ -197,10 +198,10 @@ def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None
         self._playback_was_interrupted = False
 
         # wait for finish
-        max_timeout = total_samples / sample_rate
+        max_timeout = (total_samples / sample_rate) + (self._speaker_sync_delay_ms / 1000.0) + 1.0
 
         now = time.monotonic()
-        completed = self._playback_finished_event.wait(max_timeout + 1)
+        completed = self._playback_finished_event.wait(max_timeout)
         interrupted = self._playback_was_interrupted
         elapsed = time.monotonic() - now
 
@@ -297,9 +298,9 @@ async def handle_default_msg(ws_msg: str | bytes) -> bool:
             if ws_msg == "sync_ping":
                 await websocket.send(f"sync_pong:{time.time()}")
                 return False
-            elif isinstance(ws_msg, str) and ws_msg.startswith("room"):
+            elif isinstance(ws_msg, str) and ws_msg.startswith("room:"):
                 nonlocal room
-                room = ws_msg.split(":")[1]
+                room = ws_msg.split(":", maxsplit=1)[1]
                 return False
             return True
 
@@ -335,12 +336,17 @@ async def handle_default_msg(ws_msg: str | bytes) -> bool:
             # We acquire the lock just long enough to grab the data safely.
             try:
                 with self._audio_lock:
-                    # Send timestamp, then sample rate, then bytes
-                    await websocket.send("time:" + str(self._audio_data.play_time))
-                    await websocket.send("sampleRate:" + str(self._audio_data.sample_rate))
-                    await websocket.send(self._audio_data.data.tobytes())
+                    play_time = self._audio_data.play_time
+                    sample_rate = self._audio_data.sample_rate
+                    audio_data_bytes = self._audio_data.data.tobytes()
+                    sample_count = len(self._audio_data.data)
 
-                    logger.debug(f"Playing audio with sample rate: {self._audio_data.sample_rate} Hz, length: {len(self._audio_data.data)} samples")
+                # Send timestamp, then sample rate, then bytes
+                await websocket.send("time:" + str(play_time))
+                await websocket.send("sampleRate:" + str(sample_rate))
+                await websocket.send(audio_data_bytes)
+
+                logger.debug(f"Playing audio with sample rate: {sample_rate} Hz, length: {sample_count} samples")
             except websockets.exceptions.ConnectionClosed:
                 self._playback_was_interrupted = True
                 self._is_playing = False
@@ -394,7 +400,7 @@ async def relinquish():
 
         # send sample rate
         try:
-            await websocket.send(str(self.SAMPLE_RATE))
+            await websocket.send("sampleRate:" + str(self.SAMPLE_RATE))
         except websockets.exceptions.ConnectionClosed:
             return
 
@@ -405,11 +411,9 @@ async def relinquish():
             except websockets.exceptions.ConnectionClosed:
                 break
 
-            if isinstance(msg, str) and msg.startswith("room"):
-                room = msg.split(":")[1]
-                continue
-
-            if self._is_listening:
+            if isinstance(msg, str) and msg.startswith("room:"):
+                room = msg.split(":", maxsplit=1)[1]
+            elif isinstance(msg, bytes) and self._is_listening:
                 # append to current_data
                 data = np.frombuffer(msg, dtype=np.float32)
                 current_data = np.append(current_data, data)
@@ -445,7 +449,8 @@ async def relinquish():
                                 self._mic_state.silence_chunks = 0
                             else:
                                 self._mic_state.silence_chunks += 1
-            else:
+
+            if not self._is_listening:
                 # reset when not listening
                 current_data = np.empty((0,), dtype=np.float32)
                 vad_model.reset_states()
diff --git a/tests/audio-websocket-mic.html b/tests/audio-websocket-mic.html
index a0a4e94c..973d32a5 100644
--- a/tests/audio-websocket-mic.html
+++ b/tests/audio-websocket-mic.html
@@ -57,7 +57,7 @@
 <button id="roomBtn">Set room</button>
 
 <script>
-    const MIC_WSL_URL = 'ws://localhost:5050/microphone';
+    const MIC_WSL_URL = 'ws://localhost:5051/microphone';
     const microphoneStatusEl = document.getElementById('microphoneStatus');
 
     const startBtn = document.getElementById('startBtn');
@@ -105,29 +105,24 @@
         micWs.onmessage = async (event) => {
             // Treat all messages as sample data
             try {
-                let textData;
-                // Depending on your server, the first message might be text or binary-encoded text
-                if (event.data instanceof ArrayBuffer) {
-                    textData = new TextDecoder().decode(event.data);
-                } else {
-                    textData = event.data;
-                }
-
-                micSampleRate = parseInt(textData, 10);
-
-                if (isNaN(micSampleRate) || micSampleRate <= 0) {
-                    console.error("Invalid sample rate received");
-                    micSampleRate = null;
-                    return;
-                }
+                if (typeof event.data === 'string' && event.data.startsWith("sampleRate:")) {
+                    let sampleRateStr = event.data.split(":")[1];
+                    micSampleRate = parseInt(sampleRateStr, 10);
+
+                    if (isNaN(micSampleRate) || micSampleRate <= 0) {
+                        console.error("Invalid sample rate received");
+                        micSampleRate = null;
+                        return;
+                    }
 
-                micStatus(`Recording voice at ${micSampleRate}Hz`);
+                    micStatus(`Recording voice at ${micSampleRate}Hz`);
 
-                if (micAudioCtx) micAudioCtx.close();
+                    if (micAudioCtx) micAudioCtx.close();
 
-                // Only create context if user has clicked, otherwise audio is blocked
-                if (userHasInteracted) {
-                    await startMic();
+                    // Only create context if user has clicked, otherwise audio is blocked
+                    if (userHasInteracted) {
+                        await startMic();
+                    }
                 }
             } catch (err) {
                 console.error("Failed to parse sample rate:", err);
@@ -217,11 +212,11 @@
     }
 
     roomBtn.addEventListener('click', () => {
-       let room = document.getElementById('room').value;
+        let room = document.getElementById('room').value;
 
-       if (micWs && micWs.readyState === WebSocket.OPEN) {
-           micWs.send("room:" + room);
-       }
+        if (micWs && micWs.readyState === WebSocket.OPEN) {
+            micWs.send("room:" + room);
+        }
     });
 </script>
 </body>
diff --git a/tests/audio-websocket-speaker.html b/tests/audio-websocket-speaker.html
index d3eb2dd1..56696074 100644
--- a/tests/audio-websocket-speaker.html
+++ b/tests/audio-websocket-speaker.html
@@ -57,7 +57,7 @@
 <button id="roomBtn">Set room</button>
 
 <script>
-    const SPEAKER_WS_URL = 'ws://localhost:5050/speaker';
+    const SPEAKER_WS_URL = 'ws://localhost:5051/speaker';
     const speakerStatusEl = document.getElementById('speakerStatus');
 
     const startBtn = document.getElementById('startBtn');
@@ -111,7 +111,7 @@
         speakerWs.onmessage = (event) => {
             // Control messages
             if (typeof event.data === 'string') {
-                if (event.data.startsWith("sampleRate")) {
+                if (event.data.startsWith("sampleRate:")) {
                     let sampleRateStr = event.data.split(":")[1];
 
                     speakerCurrentSampleRate = parseInt(sampleRateStr, 10);

From 92c6b0b0b475183e7db25266c5d9a6ab261c68e1 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 20:34:50 +0200
Subject: [PATCH 09/22] Websocket audio: cleanup test HTML (review by
 coderabbit)

---
 tests/audio-websocket-mic.html     | 11 +++++++++-
 tests/audio-websocket-speaker.html | 33 +++++++++++++++++++++---------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/tests/audio-websocket-mic.html b/tests/audio-websocket-mic.html
index 973d32a5..68108423 100644
--- a/tests/audio-websocket-mic.html
+++ b/tests/audio-websocket-mic.html
@@ -86,6 +86,7 @@
         micStream = await navigator.mediaDevices.getUserMedia({audio: {channelCount: 1}});
     })
 
+    // connect microphone websocket
     function micConnect() {
         if (micIsReconnecting) return;
 
@@ -100,6 +101,7 @@
             micStatus("Connected! Waiting for sample rate...");
             // Reset on a fresh connection
             micSampleRate = null;
+            sendRoom();
         };
 
         micWs.onmessage = async (event) => {
@@ -154,6 +156,7 @@
         };
     }
 
+    // all variables are filled: start recording
     async function startMic() {
         micAudioCtx = new window.AudioContext({
             sampleRate: micSampleRate
@@ -206,17 +209,23 @@
         silentSink.connect(micAudioCtx.destination);
     }
 
+    // set status HTML div, also log to console
     function micStatus(status) {
         console.log("Microphone Status changed: ", status);
         microphoneStatusEl.innerText = "Mic Status: " + status;
     }
 
-    roomBtn.addEventListener('click', () => {
+    // send the room data if the WS is open
+    function sendRoom() {
         let room = document.getElementById('room').value;
 
         if (micWs && micWs.readyState === WebSocket.OPEN) {
             micWs.send("room:" + room);
         }
+    }
+
+    roomBtn.addEventListener('click', () => {
+        sendRoom();
     });
 </script>
 </body>
diff --git a/tests/audio-websocket-speaker.html b/tests/audio-websocket-speaker.html
index 56696074..a2a8688e 100644
--- a/tests/audio-websocket-speaker.html
+++ b/tests/audio-websocket-speaker.html
@@ -87,6 +87,7 @@
         }
     });
 
+    // connect to WS
     function speakerConnect() {
         if (speakerIsReconnecting) return;
 
@@ -104,6 +105,8 @@
             speakerCurrentPlayTimeMs = null;
             speakerAudioCtx = null;
             speakerCurrentAudio = null;
+
+            sendRoom();
         };
 
         syncClockWithServer(speakerWs);
@@ -125,10 +128,10 @@
                     speakerStatus(`Streaming audio at ${speakerCurrentSampleRate}Hz`);
 
                     // Re-initialize AudioContext with the correct sample rate if needed
-                    if (speakerAudioCtx) speakerAudioCtx.close();
+                    if (speakerAudioCtx && speakerAudioCtx.sampleRate !== speakerCurrentSampleRate) speakerAudioCtx.close();
 
                     // Only create context if user has clicked, otherwise audio is blocked
-                    if (userHasInteracted) {
+                    if (userHasInteracted && (speakerAudioCtx == null || speakerAudioCtx.state === "closed")) {
                         speakerAudioCtx = new window.AudioContext({
                             sampleRate: speakerCurrentSampleRate
                         });
@@ -171,6 +174,7 @@
         };
     }
 
+    // all variables are filled and audio was received: play it
     function playAudioChunk(arrayBuffer, serverPlaytimeMs) {
         if (!speakerAudioCtx) return;
 
@@ -204,10 +208,12 @@
         // Schedule the chunk to play seamlessly after the previous one
         speakerCurrentAudio.start(audioCtxPlayTime, playOffset);
 
+        // copy localWs to not use (*another* speakerWs instance
+        let localWs = speakerWs;
         // send ACK on end
         speakerCurrentAudio.onended = () => {
-            if (speakerWs && speakerWs.readyState === WebSocket.OPEN) {
-                speakerWs.send("played");
+            if (localWs && localWs.readyState === WebSocket.OPEN) {
+                localWs.send("played");
                 speakerStatus("Current audio played.");
             }
         };
@@ -217,6 +223,7 @@
         speakerCurrentPlayTimeMs = null;
     }
 
+    // set status HTML div, also log to console
     function speakerStatus(status) {
         console.log("Speaker Status changed: ", status);
         speakerStatusEl.innerText = "Speaker Status: " + status;
@@ -270,10 +277,11 @@
             ws.send("sync_ping");
         }
 
+        // helper to fire the initial ping: try the websocket max. 5 times, wait 500ms after each unsuccessful attempt
         function sendPingInit(i) {
             if (ws.readyState === WebSocket.OPEN) {
                 sendPing();
-            } else {
+            } else if (i <= 5) {
                 setTimeout(() => sendPingInit(i + 1), 500);
             }
         }
@@ -282,12 +290,17 @@
         sendPingInit(0);
     }
 
-    roomBtn.addEventListener('click', () => {
-       let room = document.getElementById('room').value;
+    // send the room data if the WS is open
+    function sendRoom() {
+        let room = document.getElementById('room').value;
 
-       if (speakerWs && speakerWs.readyState === WebSocket.OPEN) {
-           speakerWs.send("room:" + room);
-       }
+        if (speakerWs && speakerWs.readyState === WebSocket.OPEN) {
+            speakerWs.send("room:" + room);
+        }
+    }
+
+    roomBtn.addEventListener('click', () => {
+        sendRoom();
     });
 </script>
 </body>

From 672c51f47906ef651285b64787e8468de006a86c Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 20:57:38 +0200
Subject: [PATCH 10/22] Websocket audio: speaker test client HTML: better
 cleanup on socket close

---
 tests/audio-websocket-speaker.html | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/audio-websocket-speaker.html b/tests/audio-websocket-speaker.html
index a2a8688e..ad3315e7 100644
--- a/tests/audio-websocket-speaker.html
+++ b/tests/audio-websocket-speaker.html
@@ -160,6 +160,20 @@
             speakerWs = null;
             speakerIsReconnecting = true;
 
+            if (speakerCurrentAudio) {
+                speakerCurrentAudio.onended = null;
+                try {
+                    speakerCurrentAudio.stop();
+                } catch (_err) {}
+                speakerCurrentAudio = null;
+            }
+            if (speakerAudioCtx) {
+                if (speakerAudioCtx.state !== "closed") {
+                    speakerAudioCtx.close();
+                }
+                speakerAudioCtx = null;
+            }
+
             // Retry loop
             setTimeout(() => {
                 speakerIsReconnecting = false;

From 1111ce5d4435af254ccdca670a69b260ad5083c3 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 21:06:13 +0200
Subject: [PATCH 11/22] Websocket audio: test client HTML: fix bug that sent
 the room tag even if none was set

---
 tests/audio-websocket-mic.html     | 4 ++--
 tests/audio-websocket-speaker.html | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/audio-websocket-mic.html b/tests/audio-websocket-mic.html
index 68108423..78919fef 100644
--- a/tests/audio-websocket-mic.html
+++ b/tests/audio-websocket-mic.html
@@ -215,11 +215,11 @@
         microphoneStatusEl.innerText = "Mic Status: " + status;
     }
 
-    // send the room data if the WS is open
+    // send the room data if the WS is open and room data is set
     function sendRoom() {
         let room = document.getElementById('room').value;
 
-        if (micWs && micWs.readyState === WebSocket.OPEN) {
+        if (room.trim().length !== 0 && micWs && micWs.readyState === WebSocket.OPEN) {
             micWs.send("room:" + room);
         }
     }
diff --git a/tests/audio-websocket-speaker.html b/tests/audio-websocket-speaker.html
index ad3315e7..3d0c607a 100644
--- a/tests/audio-websocket-speaker.html
+++ b/tests/audio-websocket-speaker.html
@@ -304,11 +304,11 @@
         sendPingInit(0);
     }
 
-    // send the room data if the WS is open
+    // send the room data if the WS is open and room data is set
     function sendRoom() {
         let room = document.getElementById('room').value;
 
-        if (speakerWs && speakerWs.readyState === WebSocket.OPEN) {
+        if (room.trim().length !== 0 && speakerWs && speakerWs.readyState === WebSocket.OPEN) {
             speakerWs.send("room:" + room);
         }
     }

From af5fbcb716decbfe06c1aef8e862291deee1c1e6 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 30 Mar 2026 21:25:21 +0200
Subject: [PATCH 12/22] Websocket audio: speaker test client HTML: suppress ACK
 on reset

---
 tests/audio-websocket-speaker.html | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/audio-websocket-speaker.html b/tests/audio-websocket-speaker.html
index 3d0c607a..a0f39df0 100644
--- a/tests/audio-websocket-speaker.html
+++ b/tests/audio-websocket-speaker.html
@@ -140,7 +140,9 @@
                     let valueStr = event.data.split(":")[1];
                     speakerCurrentPlayTimeMs = parseFloat(valueStr) * 1000
                 } else if (event.data === "reset" && speakerCurrentAudio) {
+                    speakerCurrentAudio.onended = null;
                     speakerCurrentAudio.stop();
+                    speakerCurrentAudio = null;
                     speakerStatus("Stream reset");
                 }
             }

From 69f6138345cd5bed33c28bcd52eb3bab5977c019 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Wed, 1 Apr 2026 22:20:44 +0200
Subject: [PATCH 13/22] Websocket audio: move docs to appropriate location

---
 README.md                                               | 2 +-
 README_WEBSOCKET_PROTOCOL.md => docs/audio_websocket.md | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename README_WEBSOCKET_PROTOCOL.md => docs/audio_websocket.md (100%)

diff --git a/README.md b/README.md
index 6f4aac8c..2a35c2e9 100644
--- a/README.md
+++ b/README.md
@@ -496,7 +496,7 @@ You can use `tests/audio-websocket-both.html` to speak and hear GLaDOS.
 
 For configuration options, check out `configs/glados_websocket_config.yaml`.
 
-For an exact description of the websocket protocol, see `README_WEBSOCKET_PROTOCOL.md`.
+For an exact description of the websocket protocol, see [docs/audio_websocket.md](./docs/audio_websocket.md).
 
 ## Troubleshooting
 
diff --git a/README_WEBSOCKET_PROTOCOL.md b/docs/audio_websocket.md
similarity index 100%
rename from README_WEBSOCKET_PROTOCOL.md
rename to docs/audio_websocket.md

From 3ef20c3866fa1a9a41a04fed2ea8965e23c679d3 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Wed, 1 Apr 2026 22:30:15 +0200
Subject: [PATCH 14/22] Websocket audio/client POC: add mute button

---
 tests/audio-websocket-mic.html     | 22 +++++++++++++++++++++-
 tests/audio-websocket-speaker.html |  3 ++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/tests/audio-websocket-mic.html b/tests/audio-websocket-mic.html
index 78919fef..8c06d735 100644
--- a/tests/audio-websocket-mic.html
+++ b/tests/audio-websocket-mic.html
@@ -31,6 +31,7 @@
             border: none;
             border-radius: 6px;
             transition: background-color 0.2s;
+            margin: 5px;
         }
 
         button:hover {
@@ -43,7 +44,7 @@
         }
 
         input {
-            margin-top: 15px;
+            margin-top: 10px;
             margin-bottom: 5px;
         }
     </style>
@@ -56,12 +57,15 @@
 <input type="text" id="room"/>
 <button id="roomBtn">Set room</button>
 
+<button id="muteBtn" aria-pressed="false">Mute</button>
+
 <script>
     const MIC_WSL_URL = 'ws://localhost:5051/microphone';
     const microphoneStatusEl = document.getElementById('microphoneStatus');
 
     const startBtn = document.getElementById('startBtn');
     const roomBtn = document.getElementById('roomBtn');
+    const muteBtn = document.getElementById('muteBtn');
 
     let micWs = null;
     let micSampleRate = null;
@@ -227,6 +231,22 @@
     roomBtn.addEventListener('click', () => {
         sendRoom();
     });
+
+
+    // toggle microphone mute / unmute
+    muteBtn.addEventListener('click', () => {
+        const isPressed = muteBtn.getAttribute('aria-pressed') === 'true';
+        muteBtn.setAttribute('aria-pressed', (!isPressed).toString());
+        muteBtn.textContent = isPressed ? 'Mute' : 'Unmute';
+
+        // if isPressed = true, we were muted and should now unmute
+        const enabled = isPressed;
+        console.log("Set mic input to " + enabled);
+        if (micStream) {
+            const tracks = micStream.getAudioTracks();
+            tracks.forEach(track => track.enabled = enabled);
+        }
+    })
 </script>
 </body>
 </html>
\ No newline at end of file
diff --git a/tests/audio-websocket-speaker.html b/tests/audio-websocket-speaker.html
index a0f39df0..0ae97c25 100644
--- a/tests/audio-websocket-speaker.html
+++ b/tests/audio-websocket-speaker.html
@@ -31,6 +31,7 @@
             border: none;
             border-radius: 6px;
             transition: background-color 0.2s;
+            margin: 5px;
         }
 
         button:hover {
@@ -43,7 +44,7 @@
         }
 
         input {
-            margin-top: 15px;
+            margin-top: 10px;
             margin-bottom: 5px;
         }
     </style>

From 6914cdc08e1faa8bf204dda34dd014a4ca8910f7 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Thu, 2 Apr 2026 00:35:53 +0200
Subject: [PATCH 15/22] Websocket audio: enforce audio_data data type, fix bug
 in slow_clap tool where audio was read as f64

---
 src/glados/audio_io/sounddevice_io.py | 2 +-
 src/glados/audio_io/websocket_io.py   | 2 +-
 src/glados/tools/slow_clap.py         | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/glados/audio_io/sounddevice_io.py b/src/glados/audio_io/sounddevice_io.py
index 253b8486..72c9f87f 100644
--- a/src/glados/audio_io/sounddevice_io.py
+++ b/src/glados/audio_io/sounddevice_io.py
@@ -130,7 +130,7 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
             RuntimeError: If audio playback cannot be initiated
             ValueError: If audio_data is empty or not a valid numpy array
         """
-        if not isinstance(audio_data, np.ndarray) or audio_data.size == 0:
+        if not isinstance(audio_data, np.ndarray) or audio_data.size == 0 or audio_data.dtype != np.float32:
             raise ValueError("Invalid audio data")
 
         if sample_rate is None:
diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index a41861a1..64fb40a7 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -149,7 +149,7 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
             text: Optional text associated with the audio (not used by this implementation)
             wait: Optionally wait for the audio_data to be spoken
         """
-        if not isinstance(audio_data, np.ndarray) or audio_data.size == 0:
+        if not isinstance(audio_data, np.ndarray) or audio_data.size == 0 or audio_data.dtype != np.float32:
             raise ValueError("Invalid audio data")
 
         if sample_rate is None:
diff --git a/src/glados/tools/slow_clap.py b/src/glados/tools/slow_clap.py
index eb40ab74..62a154e0 100644
--- a/src/glados/tools/slow_clap.py
+++ b/src/glados/tools/slow_clap.py
@@ -57,7 +57,7 @@ def run(self, tool_call_id: str, call_args: dict[str, Any]) -> None:
             claps = 1
 
         try:
-            data, sample_rate = sf.read(self.audio_path)
+            data, sample_rate = sf.read(self.audio_path, dtype='float32')
 
             for _ in range(claps):
                 self.audio_io.start_speaking(data, sample_rate=sample_rate, wait=True)
@@ -65,7 +65,7 @@ def run(self, tool_call_id: str, call_args: dict[str, Any]) -> None:
                 {
                     "role": "tool",
                     "tool_call_id": tool_call_id,
-                    "content": "success",
+                    "content": "Success. The tool played a slow clap audio to the user. You do not need to narrate the clapping.",
                     "type": "function_call_output",
                 }
             )

From 4950d2d513c64d9f3dd8df4ab8620b1ef8edc91b Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Thu, 2 Apr 2026 01:34:43 +0200
Subject: [PATCH 16/22] Websocket audio: fix timing issues and initialization
 exception propagation, slow_clap: lazy default init

---
 src/glados/audio_io/websocket_io.py | 34 +++++++++++++++++++++--------
 src/glados/tools/slow_clap.py       |  4 +++-
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index 64fb40a7..65260b9f 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -1,4 +1,5 @@
 import asyncio
+import concurrent.futures
 import logging
 import queue
 import threading
@@ -54,8 +55,8 @@ class WebsocketAudioIO:
     PORT: int = 5051  # websockets server port
     SPEAKER_SYNC_DELAY_MS: int = 250  # Milliseconds to add to start time to account for speaker synchronisation
     MIC_MAX_SILENCE_CHUNKS: int = 10  # how many VAD chunks must be silent for a mic to relinquish control
-    DEFAULT_ROOM_TAG: str = "office" # default room tag
-    SEGREGATE_SPEAKERS: bool = False # default value for speaker segregation.
+    DEFAULT_ROOM_TAG: str = "office"  # default room tag
+    SEGREGATE_SPEAKERS: bool = False  # default value for speaker segregation.
 
     def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] | None = None) -> None:
         """Initialize the websocket audio I/O.
@@ -125,8 +126,14 @@ def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] |
         self._mic_state_lock: asyncio.Lock
         self._mic_state = MicState(room=self._default_room_tag)
 
-        self._server_thread = threading.Thread(target=lambda s, p: asyncio.run(self._run_server(s, p)), args=(server, port), daemon=True)
+        startup_future: concurrent.futures.Future[None] = concurrent.futures.Future()
+        self._server_thread = threading.Thread(
+            target=lambda s, p, f: asyncio.run(self._run_server(s, p, f)),
+            args=(server, port, startup_future),
+            daemon=True
+        )
         self._server_thread.start()
+        startup_future.result(timeout=10)
 
     def start_listening(self) -> None:
         """Start capturing audio from the websocket.
@@ -155,10 +162,12 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
         if sample_rate is None:
             sample_rate = self.SAMPLE_RATE
 
-        # Stop any existing playback
-        self.stop_speaking()
+        if self._is_playing:
+            # Stop any existing playback and wait for finish
+            self.stop_speaking()
+            self._playback_finished_event.wait()
 
-        # Playback is not finished
+        # Playback is finished
         self._playback_finished_event.clear()
 
         # Lock, set data, unlock
@@ -209,8 +218,9 @@ def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None
             logger.debug("Playback was interrupted in Server thread")
 
         if not completed:
-            interrupted = True
             logger.debug("Audio playback timed out, forcing interruption")
+            # Assume nothing was played because no speaker was there
+            return True, 0
 
         played_samples = elapsed * sample_rate
         percentage_played = min(int(played_samples * 100 / total_samples), 100)
@@ -243,7 +253,7 @@ def get_sample_queue(self) -> queue.Queue[tuple[NDArray[np.float32], bool]]:
         """
         return self._sample_queue
 
-    async def _run_server(self, server: str, port: int) -> None:
+    async def _run_server(self, server: str, port: int, result_future: concurrent.futures.Future) -> None:
         """Runs the websocket server.
 
         Args:
@@ -266,7 +276,13 @@ def emit(self, record: logging.LogRecord) -> None:
         ws_logger.addHandler(ws_log_handler)
         ws_logger.propagate = False
 
-        server = await websockets.serve(self._server_listen, host=server, port=port)
+        try:
+            server = await websockets.serve(self._server_listen, host=server, port=port)
+            result_future.set_result(None)
+        except OSError as ex:
+            result_future.set_exception(ex)
+            raise
+
         await server.serve_forever()
 
     async def _server_listen(self, websocket: websockets.ServerConnection) -> None:
diff --git a/src/glados/tools/slow_clap.py b/src/glados/tools/slow_clap.py
index 62a154e0..8073c473 100644
--- a/src/glados/tools/slow_clap.py
+++ b/src/glados/tools/slow_clap.py
@@ -40,7 +40,9 @@ def __init__(
         self.llm_queue = llm_queue
         tool_config = tool_config or {}
         self.audio_path = tool_config.get("slow_clap_audio_path", "data/slow-clap.mp3")
-        self.audio_io = tool_config.get("audio_io", get_audio_system())
+        self.audio_io = tool_config.get("audio_io")
+        if self.audio_io is None:
+            self.audio_io = get_audio_system()
 
     def run(self, tool_call_id: str, call_args: dict[str, Any]) -> None:
         """

From 36147fb0aacf8a3867c92f649218bade2d96eb23 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Thu, 2 Apr 2026 01:41:20 +0200
Subject: [PATCH 17/22] Websocket audio: logAdapter: fix wrong format string

---
 src/glados/audio_io/websocket_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index 65260b9f..4f28982f 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -270,7 +270,7 @@ def emit(self, record: logging.LogRecord) -> None:
                 getattr(logger, level)(msg)
 
         ws_log_handler = LogAdapter()
-        ws_log_handler.setFormatter(logging.Formatter("[%(asctime)s] $(name)s %(message)s"))
+        ws_log_handler.setFormatter(logging.Formatter("[%(asctime)s] %(name)s %(message)s"))
 
         ws_logger = logging.getLogger("websockets")
         ws_logger.addHandler(ws_log_handler)

From b37c7efed9cc83e60e9e1e3c75a55e875f309ea5 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Thu, 2 Apr 2026 01:58:36 +0200
Subject: [PATCH 18/22] Websocket audio: segregate_speakers: fix bool parsing

---
 src/glados/audio_io/websocket_io.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index 4f28982f..6cff4e1f 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -102,7 +102,10 @@ def __init__(self, vad_threshold: float | None = None, options: dict[str, Any] |
                     case "default_room_tag":
                         self._default_room_tag = str(val)
                     case "segregate_speakers":
-                        self._segregate_speakers = bool(val)
+                        if isinstance(val, bool):
+                            self._segregate_speakers = val
+                        else:
+                            raise ValueError("segregate_speakers must be a boolean value")
                     case _:
                         raise ValueError(f"Websocket backend: unsupported option '{key}'")
 

From 57c5e137fe4a7f1aefc556ed97c4dba82c59c9bf Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Thu, 2 Apr 2026 17:44:29 +0200
Subject: [PATCH 19/22] Websocket audio: fix theoretical race condition where
 multiple speakers could set exit flags multiple times

---
 src/glados/audio_io/websocket_io.py | 61 +++++++++++++++++++----------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index 6cff4e1f..6d04e53f 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -24,6 +24,7 @@ class AudioData:
     data: NDArray[np.float32]
     sample_rate: int
     play_time: float
+    track_id: uuid.UUID | None
 
 
 @dataclass
@@ -177,10 +178,12 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
         with self._audio_lock:
             # allow for network jitter, time to websocket send, etc.
             play_time = time.time() + (self._speaker_sync_delay_ms / 1000)
-            self._audio_data = AudioData(audio_data, sample_rate, play_time)
+            self._audio_data = AudioData(audio_data, sample_rate, play_time, uuid.uuid4())
 
+        # set state
         self._stop_playback = False
         self._is_playing = True
+        self._playback_was_interrupted = False
 
         logger.debug("Scheduled audio playback")
 
@@ -207,8 +210,6 @@ def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None
         if sample_rate is None:
             sample_rate = self.SAMPLE_RATE
 
-        self._playback_was_interrupted = False
-
         # wait for finish
         max_timeout = (total_samples / sample_rate) + (self._speaker_sync_delay_ms / 1000.0) + 1.0
 
@@ -323,6 +324,26 @@ async def handle_default_msg(ws_msg: str | bytes) -> bool:
                 return False
             return True
 
+        def set_flags_once(track_id: uuid.UUID, was_interrupted: bool) -> None:
+            """
+            Set flags that audio was played if the given track_id matches the currently stored track_id.
+            If flags are set, the track_id is cleared from self._audio_data.
+            This ensures that the flags are only set by 1 speaker task.
+
+            Args:
+                track_id: ID of the audio track
+                was_interrupted: If the audio was interrupted (as interpreted by this task).
+            """
+            assert track_id is not None
+
+            with self._audio_lock:
+                if self._audio_data.track_id == track_id:
+                    self._playback_was_interrupted = was_interrupted
+                    self._is_playing = False
+                    self._playback_finished_event.set()
+                    # ensure that this is only called once
+                    self._audio_data.track_id = None
+
         while True:
             # 1. IDLE LOOP: Check for play state, but listen for sync pings in the meantime!
             while not self._is_playing:
@@ -353,13 +374,18 @@ async def handle_default_msg(ws_msg: str | bytes) -> bool:
 
             # 2. AUDIO SEND PHASE
             # We acquire the lock just long enough to grab the data safely.
-            try:
-                with self._audio_lock:
-                    play_time = self._audio_data.play_time
-                    sample_rate = self._audio_data.sample_rate
-                    audio_data_bytes = self._audio_data.data.tobytes()
-                    sample_count = len(self._audio_data.data)
+            with self._audio_lock:
+                play_time = self._audio_data.play_time
+                sample_rate = self._audio_data.sample_rate
+                audio_data_bytes = self._audio_data.data.tobytes()
+                sample_count = len(self._audio_data.data)
+                current_track_id = self._audio_data.track_id
 
+            # Audio with no track ID should not be played
+            if current_track_id is None:
+                continue
+
+            try:
                 # Send timestamp, then sample rate, then bytes
                 await websocket.send("time:" + str(play_time))
                 await websocket.send("sampleRate:" + str(sample_rate))
@@ -367,10 +393,8 @@ async def handle_default_msg(ws_msg: str | bytes) -> bool:
 
                 logger.debug(f"Playing audio with sample rate: {sample_rate} Hz, length: {sample_count} samples")
             except websockets.exceptions.ConnectionClosed:
-                self._playback_was_interrupted = True
-                self._is_playing = False
-                self._playback_finished_event.set()
-                break
+                set_flags_once(current_track_id, True)
+                return
 
             # 3. WAITING PHASE
             while not self._stop_playback:
@@ -378,21 +402,18 @@ async def handle_default_msg(ws_msg: str | bytes) -> bool:
                     message = await asyncio.wait_for(websocket.recv(), timeout=0.05)
                     if await handle_default_msg(message) and message == "played":
                         logger.debug("Websocket: Audio played fully")
-                        self._playback_was_interrupted = False
+                        set_flags_once(current_track_id, False)
                         break
                 except asyncio.TimeoutError:
                     continue
                 except websockets.exceptions.ConnectionClosed:
-                    self._playback_was_interrupted = True
-                    break
+                    set_flags_once(current_track_id, True)
+                    return
             else:
                 # self._stop_playback is true
-                self._playback_was_interrupted = True
                 await websocket.send("reset")
                 logger.debug("Sent audio reset")
-
-            self._is_playing = False
-            self._playback_finished_event.set()
+                set_flags_once(current_track_id, True)
 
     async def _server_microphone(self, websocket: websockets.ServerConnection) -> None:
         """

From 0e4a229d4911f3be37b44ffe1db6d41aa96e42d9 Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Thu, 2 Apr 2026 18:12:26 +0200
Subject: [PATCH 20/22] Websocket audio: Speaker: copy audio data under the lock
 instead of storing a reference; handle exceptions on reset. Microphone: always
 process as much audio as possible

---
 src/glados/audio_io/websocket_io.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index 6d04e53f..4182d399 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -178,7 +178,7 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
         with self._audio_lock:
             # allow for network jitter, time to websocket send, etc.
             play_time = time.time() + (self._speaker_sync_delay_ms / 1000)
-            self._audio_data = AudioData(audio_data, sample_rate, play_time, uuid.uuid4())
+            self._audio_data = AudioData(np.copy(audio_data), sample_rate, play_time, uuid.uuid4())
 
         # set state
         self._stop_playback = False
@@ -411,9 +411,13 @@ def set_flags_once(track_id: uuid.UUID, was_interrupted: bool) -> None:
                     return
             else:
                 # self._stop_playback is true
-                await websocket.send("reset")
-                logger.debug("Sent audio reset")
-                set_flags_once(current_track_id, True)
+                try:
+                    await websocket.send("reset")
+                    logger.debug("Sent audio reset")
+                except websockets.exceptions.ConnectionClosed:
+                    logger.debug("Speaker disconnected before reset could be sent")
+                finally:
+                    set_flags_once(current_track_id, True)
 
     async def _server_microphone(self, websocket: websockets.ServerConnection) -> None:
         """
@@ -458,8 +462,8 @@ async def relinquish():
                 data = np.frombuffer(msg, dtype=np.float32)
                 current_data = np.append(current_data, data)
 
-                # if enough current data is there, run it through the VAD
-                if len(current_data) >= vad_needed_samples:
+                # process every complete VAD window stored
+                while len(current_data) >= vad_needed_samples:
                     # get data for VAD
                     vad_data = current_data[:vad_needed_samples]
                     # extra data stays for next VAD

From 83f82585c2c32568afe499bea44f492106605ead Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Thu, 2 Apr 2026 18:31:05 +0200
Subject: [PATCH 21/22] Websocket audio: Speaker: avoid hanging if no speakers
 are connected (CodeRabbit)

---
 src/glados/audio_io/websocket_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glados/audio_io/websocket_io.py b/src/glados/audio_io/websocket_io.py
index 4182d399..fe68fb03 100644
--- a/src/glados/audio_io/websocket_io.py
+++ b/src/glados/audio_io/websocket_io.py
@@ -169,7 +169,7 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
         if self._is_playing:
             # Stop any existing playback and wait for finish
             self.stop_speaking()
-            self._playback_finished_event.wait()
+            self._playback_finished_event.wait(timeout=2.0)
 
         # Playback is finished
         self._playback_finished_event.clear()

From c64bafee643a8974a2bea2d3663af0981b974d5c Mon Sep 17 00:00:00 2001
From: Elias <reisbauer03@proton.me>
Date: Mon, 6 Apr 2026 20:57:28 +0200
Subject: [PATCH 22/22] Websocket audio: Compatibility with PR for multiple
 configuration files

---
 src/glados/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glados/cli.py b/src/glados/cli.py
index 79dc942a..e589fb4f 100644
--- a/src/glados/cli.py
+++ b/src/glados/cli.py
@@ -196,7 +196,7 @@ def say(text: str, config_path: str | Path = "glados_config.yaml") -> None:
     # Generate the audio to from the text
     audio = glados_tts.generate_speech_audio(converted_text)
 
-    glados_config = GladosConfig.from_yaml(str(config_path))
+    glados_config = GladosConfig.from_yaml(config_path)
     audio_system = get_audio_system(backend_type=glados_config.audio_io, backend_options=glados_config.audio_io_options)
 
     # Play the audio