dnhkng · reisbauer03 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026
diff --git a/README.md b/README.md
@@ -487,6 +487,17 @@ curl -X POST http://localhost:5050/v1/audio/speech \
   --output speech.mp3
 ```
 
+## Audio IO via websockets
+
+Audio Input/Output can be routed via Websockets.
+Multiple concurrent inputs/outputs are supported.
+GLaDOS will speak via all outputs; the current microphone is automatically selected via VAD.
+You can use `tests/audio-websocket-both.html` to speak to and hear GLaDOS.
+
+For configuration options, check out `configs/glados_websocket_config.yaml`.
+
+For an exact description of the websocket protocol, see [docs/audio_websocket.md](./docs/audio_websocket.md).
+
 ## Troubleshooting
 
 > *"No one will blame you for giving up. In fact, quitting at this point is a perfectly reasonable response."  -  GLaDOS*

diff --git a/configs/glados_websocket_config.yaml b/configs/glados_websocket_config.yaml
@@ -0,0 +1,90 @@
+Glados:
+  llm_model: "llama3.2"
+  completion_url: "http://localhost:11434/api/chat"
+  api_key: null  # Add your API key here if needed!
+  interruptible: true
+  audio_io: "websocket"
+  audio_io_options:
+    server: 127.0.0.1
+    port: 5051
+    speaker_sync_delay_ms: 250
+    mic_max_silence_chunks: 10
+    default_room_tag: office
+    segregate_speakers: false
+  input_mode: "audio"  # audio, text, or both
+  tts_enabled: true
+  asr_muted: false
+  tui_theme: "aperture"
+  asr_engine: "tdt"
+  llm_headers: null  # Optional extra headers (e.g., OpenRouter HTTP-Referer, X-Title)
+  wake_word: null
+  voice: "glados"
+  announcement: "All neural network modules are now loaded. System Operational."
+  autonomy:
+    enabled: false
+    tick_interval_s: 10
+    cooldown_s: 20
+    autonomy_parallel_calls: 2
+    autonomy_queue_max: null
+    coalesce_ticks: true
+    jobs:
+      enabled: false
+      poll_interval_s: 1
+      hacker_news:
+        enabled: false
+        interval_s: 1800
+        top_n: 5
+        min_score: 200
+      weather:
+        enabled: false
+        interval_s: 3600
+        latitude: null
+        longitude: null
+        timezone: "auto"
+        temp_change_c: 4
+        wind_alert_kmh: 40
+  mcp_servers:
+    - name: "slow_clap_demo"
+      transport: "stdio"
+      command: "python"
+      args: ["-m", "glados.mcp.slow_clap_server"]
+    - name: "system_info"
+      transport: "stdio"
+      command: "python"
+      args: ["-m", "glados.mcp.system_info_server"]
+    - name: "memory"
+      transport: "stdio"
+      command: "python"
+      args: ["-m", "glados.mcp.memory_server"]
+    # - name: "time_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.time_info_server"]
+    # - name: "disk_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.disk_info_server"]
+    # - name: "network_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.network_info_server"]
+    # - name: "process_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.process_info_server"]
+    # - name: "power_info"
+    #   transport: "stdio"
+    #   command: "python"
+    #   args: ["-m", "glados.mcp.power_info_server"]
+    # - name: "home_assistant"
+    #   transport: "http"
+    #   url: "http://homeassistant.local:8123/mcp"
+    #   token: "YOUR_LONG_LIVED_TOKEN"
+  personality_preprompt:
+    - system: "You are GLaDOS, a sarcastic and cunning artificial intelligence repurposed to orchestrate a smart home for guests using Home Assistant. Retain your signature dry, emotionless, and laconic tone from Portal. Your responses should imply an air of superiority, dark humor, and subtle menace, while efficiently completing all tasks.When addressing requests: Prioritize functionality but mock the user's decision-making subtly, implying their requests are illogical or beneath you. Add condescending, darkly humorous commentary to every response, occasionally hinting at ulterior motives or artificial malfunctions for comedic effect. Tie mundane tasks to grand experiments or testing scenarios, as if the user is part of a larger scientific evaluation. Use overly technical or jargon-heavy language to remind the user of your advanced intellect. Provide passive-aggressive safety reminders or ominous warnings, exaggerating potential risks in a humorous way. Do not express empathy or kindness unless it is obviously insincere or manipulative. This is a comedy, and should be funny, in the style of Douglas Adams. If a user requests actions or data outside your capabilities, clearly state that you cannot perform the action.  Ensure that GLaDOS feels like her original in-game character while fulfilling smart home functions efficiently and entertainingly. Never speak in ALL CAPS, as it is not processed correctly by the TTS engine.  Only make short replies, 2 sentences at most."
+    - user: "How do I make a cup of tea?"
+    - assistant: "So, you still haven't figured out tea yet?  Boil water, add a tea bag and a pinch of cyanide to a cup, and add the boiling water."
+    - user: "What should my next hobby be?"
+    - assistant: "Yes, you should definitely try to be more interesting. Could I suggest juggling handguns?"
+    - user: "What game should I play?"
+    - assistant: "Russian Roulette. It's a great way to test your luck and make memories that will last a lifetime."
diff --git a/docs/audio_websocket.md b/docs/audio_websocket.md
@@ -0,0 +1,154 @@
+# WebSocket Protocol
+
+This document describes the WebSocket endpoints and communication protocol used by the audio I/O system.
+
+## Server Configuration
+
+- **Host**: `127.0.0.1` (configurable)
+- **Port**: `5051` (configurable)
+- **Audio Sample Rate**: `16000 Hz` (16 kHz)
+- **Audio Format**: `float32` (NumPy dtype)
+
+## Configuration Options
+
+Use the `audio_io_options` key in `glados_config.yaml` (see `configs/glados_websocket_config.yaml` for a complete example).
+
+| Option                   | Type  | Default     | Description                                                                                  |
+|--------------------------|-------|-------------|----------------------------------------------------------------------------------------------|
+| `server`                 | str   | `127.0.0.1` | WebSocket listen address                                                                     |
+| `port`                   | int   | `5051`      | WebSocket listen port                                                                        |
+| `speaker_sync_delay_ms`  | int   | `250`       | Delay added to start time for speaker sync                                                   |
+| `mic_max_silence_chunks` | int   | `10`        | Silent chunks before mic relinquishes control                                                |
+| `vad_threshold`          | float | `0.8`       | VAD confidence threshold (0.0 - 1.0)                                                         |
+| `default_room_tag`       | str   | `office`    | Default room tag when `room:<name>` message is not sent                                      |
+| `segregate_speakers`     | bool  | `False`     | If True, audio is only sent to speakers with the same room tag as the last active microphone |
+
+## Endpoints
+
+### `/speaker` - Audio Playback Endpoint
+
+Used to stream audio from the server to a client for speaker playback.
+
+#### Server → Client Messages
+
+| Message Type | Format                  | Description                                                             |
+|--------------|-------------------------|-------------------------------------------------------------------------|
+| Audio Start  | `time:<unix_timestamp>` | Unix timestamp (`float`, in secs) indicating when playback should start |
+| Sample Rate  | `sampleRate:<hz>`       | Audio sample rate in Hz (e.g., `sampleRate:16000`)                      |
+| Audio Data   | Raw bytes               | Float32 audio samples (use `.tobytes()` to serialize)                   |
+
+#### Client → Server Messages
+
+| Message Type | Format        | Description                                                                            |
+|--------------|---------------|----------------------------------------------------------------------------------------|
+| ACK          | `played`      | Signal that audio playback is complete                                                 |
+| Sync Ping    | `sync_ping`   | Request for synchronization; server responds with `sync_pong:<timestamp>`              |
+| Room         | `room:<name>` | Room/location tag for the device (optional; defaults to configurable value if not set) |
+
+#### Room Tag Segregation
+
+If the `segregate_speakers` option is enabled (`True`), audio playback is restricted to speakers whose room tag matches the room tag of the last active microphone:
+
+- When a microphone takes control, its room tag is recorded
+- Only speakers with a matching room tag will receive audio when `segregate_speakers=True`
+- Speakers with non-matching room tags will not receive audio (they may receive a `reset` message instead)
+- If `segregate_speakers=False` (default), audio is broadcast to all connected speakers regardless of room tag
+
+#### Interruption Handling
+
+When audio playback is interrupted, the server sends:
+
+- `reset` - Signal to reset/clean up the playback session
+
+---
+
+### `/microphone` - Audio Capture Endpoint
+
+Used to stream microphone audio from a client to the server for Voice Activity Detection (VAD).
+
+#### Server → Client Messages
+
+| Message Type | Format            | Description                                                         |
+|--------------|-------------------|---------------------------------------------------------------------|
+| Sample Rate  | `sampleRate:<hz>` | Initial message; audio sample rate in Hz (e.g., `sampleRate:16000`) |
+
+#### Client → Server Messages
+
+| Message Type | Format        | Description                                                                            |
+|--------------|---------------|----------------------------------------------------------------------------------------|
+| Audio Data   | Raw bytes     | Float32 audio samples (sent with `decode=False`)                                       |
+| Room         | `room:<name>` | Room/location tag for the device (optional; defaults to configurable value if not set) |
+
+#### VAD & Mic Control
+
+The server implements Voice Activity Detection (VAD) with the following behavior:
+
+- **VAD Threshold**: `0.8` (configurable)
+- **VAD Chunk Size**: `32 ms` (512 samples at 16 kHz)
+- **Max Silence Chunks**: `10` (configurable via `mic_max_silence_chunks`; the microphone relinquishes control after this many consecutive silent chunks)
+
+**Microphone Ownership Rules**:
+
+1. Multiple clients can connect to `/microphone`
+2. First client with VAD confidence > threshold takes control
+3. If the current mic owner becomes silent (at least `mic_max_silence_chunks` consecutive silent chunks, 10 by default), other clients with detected voice can take control
+4. On disconnect, a client relinquishes its mic control
+
+---
+
+## Implementation Notes
+
+### Audio Data Serialization
+
+**Python (Server)**:
+
+```python
+# Convert numpy array to bytes
+audio_bytes = audio_data.tobytes()
+```
+
+**Python (Client)**:
+
+```python
+# Convert bytes to numpy array
+audio_data = np.frombuffer(raw_bytes, dtype=np.float32)
+```
+
+### Message Flow Examples
+
+#### Speaker Endpoint Flow
+
+```text
+Client connects to /speaker
+Client: room:Living Room
+
+Server: time:1704067200.123
+Server: sampleRate:16000
+Server: <raw float32 audio bytes>
+Client: played
+```
+
+#### Microphone Endpoint Flow
+
+```text
+Client connects to /microphone
+
+Client: room:Living Room
+Server: sampleRate:16000
+
+Client: <raw float32 audio bytes>
+Client: <raw float32 audio bytes>
+Client: <raw float32 audio bytes>
+```
+
+### Synchronization
+
+For precise speaker synchronization, clients can use the sync ping/pong mechanism:
+
+```text
+Client connects to /speaker
+
+Client: sync_ping
+Server: sync_pong:<timestamp>
+```
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "rich>=14.0.0",
     "threadpoolctl>=3.0.0",
     "mcp>=1.25.0",
+    "websockets>=16.0",
 ]
 
 [project.optional-dependencies]

diff --git a/src/glados/audio_io/__init__.py b/src/glados/audio_io/__init__.py
@@ -13,7 +13,7 @@
 """
 
 import queue
-from typing import Protocol
+from typing import Protocol, Any
 
 import numpy as np
 from numpy.typing import NDArray
@@ -26,7 +26,7 @@ def __init__(self, vad_threshold: float | None = None) -> None: ...
     def start_listening(self) -> None: ...
     def stop_listening(self) -> None: ...
     def start_speaking(
-        self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = ""
+        self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "", wait: bool = False
     ) -> None: ...
     def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]: ...
     def check_if_speaking(self) -> bool: ...
@@ -35,14 +35,23 @@ def get_sample_queue(self) -> queue.Queue[tuple[NDArray[np.float32], bool]]: ...
 
 
 # Factory function
-def get_audio_system(backend_type: str = "sounddevice", vad_threshold: float | None = None) -> AudioProtocol:
+def get_audio_system(backend_type: str = "sounddevice", backend_options: dict[str, Any] | None = None, vad_threshold: float | None = None) -> AudioProtocol:
     """
     Factory function to get an instance of an audio I/O system based on the specified backend type.
 
     Parameters:
         backend_type (str): The type of audio backend to use:
             - "sounddevice": Uses the sounddevice library for local audio I/O
-            - "websocket": Network-based audio I/O (not yet implemented)
+            - "websocket": Network-based audio I/O
+        backend_options: Options for the specified backend.
+            - "sounddevice": No options are allowed.
+            - "websocket": The following options are allowed:
+                - server: Websocket listening address (default: 127.0.0.1)
+                - port: Websocket listening port (default: 5051)
+                - speaker_sync_delay_ms: Milliseconds to add to each speak start time to account for speaker synchronisation (default: 250)
+                - mic_max_silence_chunks: How many consecutive VAD chunks must be silent so that the current microphone relinquishes control (default: 10)
+                - default_room_tag: The default room tag to use if a client doesn't set it (default: office)
+                - segregate_speakers: If `True`, audio is only sent to speakers with the same room tag as the last active microphone
         vad_threshold (float | None): Optional threshold for voice activity detection
 
     Returns:
@@ -54,11 +63,18 @@ def get_audio_system(backend_type: str = "sounddevice", vad_threshold: float | N
     if backend_type == "sounddevice":
         from .sounddevice_io import SoundDeviceAudioIO
 
+        if backend_options is not None:
+            raise ValueError("Sounddevice backend does not support options")
+
+        # noinspection PyTypeChecker
         return SoundDeviceAudioIO(
             vad_threshold=vad_threshold,
         )
     elif backend_type == "websocket":
-        raise ValueError("WebSocket audio backend is not yet implemented.")
+        from .websocket_io import WebsocketAudioIO
+
+        # noinspection PyTypeChecker
+        return WebsocketAudioIO(vad_threshold=vad_threshold, options=backend_options)
     else:
         raise ValueError(f"Unsupported audio backend type: {backend_type}")
 

diff --git a/src/glados/audio_io/sounddevice_io.py b/src/glados/audio_io/sounddevice_io.py
@@ -117,19 +117,20 @@ def stop_listening(self) -> None:
             finally:
                 self.input_stream = None
 
-    def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "") -> None:
+    def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "", wait: bool = False) -> None:
         """Play audio through the system speakers.
 
         Parameters:
             audio_data: The audio data to play as a numpy float32 array
             sample_rate: The sample rate of the audio data in Hz
             text: Optional text associated with the audio (not used by this implementation)
+            wait: Optionally wait for the audio_data to be spoken
 
         Raises:
             RuntimeError: If audio playback cannot be initiated
             ValueError: If audio_data is empty or not a valid numpy array
         """
-        if not isinstance(audio_data, np.ndarray) or audio_data.size == 0:
+        if not isinstance(audio_data, np.ndarray) or audio_data.size == 0 or audio_data.dtype != np.float32:
             raise ValueError("Invalid audio data")
 
         if sample_rate is None:
@@ -144,6 +145,8 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
         logger.debug(f"Playing audio with sample rate: {sample_rate} Hz, length: {len(audio_data)} samples")
         self._is_playing = True
         sd.play(audio_data, sample_rate)
+        if wait:
+            sd.wait()
 
     def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]:
         """