Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
706d6de
Contain use of the `sounddevice` lib to `src/glados/audio_io` - all o…
reisbauer03 Mar 30, 2026
e747e23
Add Websocket Audio IO implementation
reisbauer03 Mar 30, 2026
1ce7e95
Websocket Audio: add HTML files for testing/showcasing
reisbauer03 Mar 30, 2026
ac0b0ee
Websocket Audio: write documentation
reisbauer03 Mar 30, 2026
3b82dd7
Websocket Audio: Support for room segregation: only play audios to sp…
reisbauer03 Mar 30, 2026
421fc74
Websocket Audio: HTML test file: better browser compatibility
reisbauer03 Mar 30, 2026
c1872a0
Websocket Audio: Fix issues raised by code rabbit.
reisbauer03 Mar 30, 2026
d169799
Websocket audio: secure server default, fix docs, timeout if waiting,…
reisbauer03 Mar 30, 2026
92c6b0b
Websocket audio: cleanup test HTML (review by coderabbit)
reisbauer03 Mar 30, 2026
672c51f
Websocket audio: speaker test client HTML: better cleanup on socket c…
reisbauer03 Mar 30, 2026
1111ce5
Websocket audio: test client HTML: fix bug that sent the room tag eve…
reisbauer03 Mar 30, 2026
af5fbcb
Websocket audio: speaker test client HTML: suppress ACK on reset
reisbauer03 Mar 30, 2026
69f6138
Websocket audio: move docs to appropriate location
reisbauer03 Apr 1, 2026
3ef20c3
Websocket audio/client POC: add mute button
reisbauer03 Apr 1, 2026
6914cdc
Websocket audio: enforce audio_data data type, fix bug in slow_clap t…
reisbauer03 Apr 1, 2026
4950d2d
Websocket audio: fix timing issues and initialization exception propa…
reisbauer03 Apr 1, 2026
36147fb
Websocket audio: logAdapter: fix wrong format string
reisbauer03 Apr 1, 2026
b37c7ef
Websocket audio: segregate_speakers: fix bool parsing
reisbauer03 Apr 1, 2026
57c5e13
Websocket audio: fix theoretical race condition there multiple speake…
reisbauer03 Apr 2, 2026
0e4a229
Websocket audio:
reisbauer03 Apr 2, 2026
83f8258
Websocket audio: Speaker: avoid hanging if no speakers are connected …
reisbauer03 Apr 2, 2026
c64bafe
Websocket audio: Compatibility with PR for multiple configuration files
reisbauer03 Apr 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,17 @@ curl -X POST http://localhost:5050/v1/audio/speech \
--output speech.mp3
```

## Audio IO via websockets

Audio input/output can be routed via WebSockets.
Multiple concurrent inputs/outputs are supported.
GLaDOS will speak via all outputs; the current microphone is automatically selected via VAD.
You can use `tests/audio-websocket-both.html` to speak and hear GLaDOS.

For configuration options, check out `configs/glados_websocket_config.yaml`.

For an exact description of the websocket protocol, see [docs/audio_websocket.md](./docs/audio_websocket.md).

## Troubleshooting

> *"No one will blame you for giving up. In fact, quitting at this point is a perfectly reasonable response." - GLaDOS*
Expand Down
90 changes: 90 additions & 0 deletions configs/glados_websocket_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
Glados:
llm_model: "llama3.2"
completion_url: "http://localhost:11434/api/chat"
api_key: null # Add your API key here if needed!
interruptible: true
audio_io: "websocket"
audio_io_options:
server: 127.0.0.1
port: 5051
speaker_sync_delay_ms: 250
mic_max_silence_chunks: 10
default_room_tag: office
segregate_speakers: false
input_mode: "audio" # audio, text, or both
tts_enabled: true
asr_muted: false
tui_theme: "aperture"
asr_engine: "tdt"
llm_headers: null # Optional extra headers (e.g., OpenRouter HTTP-Referer, X-Title)
wake_word: null
voice: "glados"
announcement: "All neural network modules are now loaded. System Operational."
autonomy:
enabled: false
tick_interval_s: 10
cooldown_s: 20
autonomy_parallel_calls: 2
autonomy_queue_max: null
coalesce_ticks: true
jobs:
enabled: false
poll_interval_s: 1
hacker_news:
enabled: false
interval_s: 1800
top_n: 5
min_score: 200
weather:
enabled: false
interval_s: 3600
latitude: null
longitude: null
timezone: "auto"
temp_change_c: 4
wind_alert_kmh: 40
mcp_servers:
- name: "slow_clap_demo"
transport: "stdio"
command: "python"
args: ["-m", "glados.mcp.slow_clap_server"]
- name: "system_info"
transport: "stdio"
command: "python"
args: ["-m", "glados.mcp.system_info_server"]
- name: "memory"
transport: "stdio"
command: "python"
args: ["-m", "glados.mcp.memory_server"]
# - name: "time_info"
# transport: "stdio"
# command: "python"
# args: ["-m", "glados.mcp.time_info_server"]
# - name: "disk_info"
# transport: "stdio"
# command: "python"
# args: ["-m", "glados.mcp.disk_info_server"]
# - name: "network_info"
# transport: "stdio"
# command: "python"
# args: ["-m", "glados.mcp.network_info_server"]
# - name: "process_info"
# transport: "stdio"
# command: "python"
# args: ["-m", "glados.mcp.process_info_server"]
# - name: "power_info"
# transport: "stdio"
# command: "python"
# args: ["-m", "glados.mcp.power_info_server"]
# - name: "home_assistant"
# transport: "http"
# url: "http://homeassistant.local:8123/mcp"
# token: "YOUR_LONG_LIVED_TOKEN"
personality_preprompt:
- system: "You are GLaDOS, a sarcastic and cunning artificial intelligence repurposed to orchestrate a smart home for guests using Home Assistant. Retain your signature dry, emotionless, and laconic tone from Portal. Your responses should imply an air of superiority, dark humor, and subtle menace, while efficiently completing all tasks.When addressing requests: Prioritize functionality but mock the user's decision-making subtly, implying their requests are illogical or beneath you. Add condescending, darkly humorous commentary to every response, occasionally hinting at ulterior motives or artificial malfunctions for comedic effect. Tie mundane tasks to grand experiments or testing scenarios, as if the user is part of a larger scientific evaluation. Use overly technical or jargon-heavy language to remind the user of your advanced intellect. Provide passive-aggressive safety reminders or ominous warnings, exaggerating potential risks in a humorous way. Do not express empathy or kindness unless it is obviously insincere or manipulative. This is a comedy, and should be funny, in the style of Douglas Adams. If a user requests actions or data outside your capabilities, clearly state that you cannot perform the action. Ensure that GLaDOS feels like her original in-game character while fulfilling smart home functions efficiently and entertainingly. Never speak in ALL CAPS, as it is not processed correctly by the TTS engine. Only make short replies, 2 sentences at most."
- user: "How do I make a cup of tea?"
- assistant: "So, you still haven't figured out tea yet? Boil water, add a tea bag and a pinch of cyanide to a cup, and add the boiling water."
- user: "What should my next hobby be?"
- assistant: "Yes, you should definitely try to be more interesting. Could I suggest juggling handguns?"
- user: "What game should I play?"
- assistant: "Russian Roulette. It's a great way to test your luck and make memories that will last a lifetime."
154 changes: 154 additions & 0 deletions docs/audio_websocket.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# WebSocket Protocol

This document describes the WebSocket endpoints and communication protocol used by the audio I/O system.

## Server Configuration

- **Host**: `127.0.0.1` (configurable)
- **Port**: `5051` (configurable)
- **Audio Sample Rate**: `16000 Hz` (16 kHz)
- **Audio Format**: `float32` (NumPy dtype)

## Configuration Options

Use the `audio_io_options` key in `glados_config.yaml`.

| Option | Type | Default | Description |
|--------------------------|-------|-------------|----------------------------------------------------------------------------------------------|
| `server` | str | `127.0.0.1` | WebSocket listen address |
| `port` | int | `5051` | WebSocket listen port |
| `speaker_sync_delay_ms` | int | `250` | Delay added to start time for speaker sync |
| `mic_max_silence_chunks` | int | `10` | Silent chunks before mic relinquishes control |
| `vad_threshold` | float | `0.8` | VAD confidence threshold (0.0 - 1.0) |
| `default_room_tag` | str | `office` | Default room tag when `room:<name>` message is not sent |
| `segregate_speakers` | bool | `False` | If True, audio is only sent to speakers with the same room tag as the last active microphone |

## Endpoints

### `/speaker` - Audio Playback Endpoint

Used to stream audio from the server to a client for speaker playback.

#### Server → Client Messages

| Message Type | Format | Description |
|--------------|-------------------------|-------------------------------------------------------------------------|
| Audio Start | `time:<unix_timestamp>` | Unix timestamp (`float`, in secs) indicating when playback should start |
| Sample Rate | `sampleRate:<hz>` | Audio sample rate in Hz (e.g., `sampleRate:16000`) |
| Audio Data | Raw bytes | Float32 audio samples (use `.tobytes()` to serialize) |

#### Client → Server Messages

| Message Type | Format | Description |
|--------------|---------------|----------------------------------------------------------------------------------------|
| ACK | `played` | Signal that audio playback is complete |
| Sync Ping | `sync_ping` | Request for synchronization; server responds with `sync_pong:<timestamp>` |
| Room | `room:<name>` | Room/location tag for the device (optional; defaults to configurable value if not set) |

#### Room Tag Segregation

If the `segregate_speakers` option is enabled (`True`), audio playback is restricted to speakers whose room tag matches the room tag of the last active microphone:

- When a microphone takes control, its room tag is recorded
- Only speakers with a matching room tag will receive audio when `segregate_speakers=True`
- Speakers with non-matching room tags will not receive audio (they may receive a `reset` message instead)
- If `segregate_speakers=False` (default), audio is broadcast to all connected speakers regardless of room tag

#### Interruption Handling

When audio playback is interrupted, the server sends:

- `reset` - Signal to reset/clean up the playback session

---

### `/microphone` - Audio Capture Endpoint

Used to stream microphone audio from a client to the server for Voice Activity Detection (VAD).

#### Server → Client Messages

| Message Type | Format | Description |
|--------------|-------------------|---------------------------------------------------------------------|
| Sample Rate | `sampleRate:<hz>` | Initial message; audio sample rate in Hz (e.g., `sampleRate:16000`) |

#### Client → Server Messages

| Message Type | Format | Description |
|--------------|---------------|----------------------------------------------------------------------------------------|
| Audio Data | Raw bytes | Float32 audio samples (sent with `decode=False`) |
| Room | `room:<name>` | Room/location tag for the device (optional; defaults to configurable value if not set) |

#### VAD & Mic Control

The server implements Voice Activity Detection (VAD) with the following behavior:

- **VAD Threshold**: `0.8` (configurable)
- **VAD Chunk Size**: `32 ms` (512 samples at 16 kHz)
- **Max Silence Chunks**: `10` (configurable via `mic_max_silence_chunks`; the microphone relinquishes control after this many consecutive silent chunks)

**Microphone Ownership Rules**:

1. Multiple clients can connect to `/microphone`
2. First client with VAD confidence > threshold takes control
3. If the current mic owner becomes silent (at least `mic_max_silence_chunks` consecutive silent chunks, 10 by default), other clients with voice can take control
4. On disconnect, a client relinquishes its mic control

---

## Implementation Notes

### Audio Data Serialization

**Python (Server)**:

```python
# Convert numpy array to bytes
audio_bytes = audio_data.tobytes()
```

**Python (Client)**:

```python
# Convert bytes to numpy array
audio_data = np.frombuffer(raw_bytes, dtype=np.float32)
```

### Message Flow Examples

#### Speaker Endpoint Flow

```text
Client connects to /speaker
Client: room:Living Room

Server: time:1704067200.123
Server: sampleRate:16000
Server: <raw float32 audio bytes>
Client: played
```

#### Microphone Endpoint Flow

```text
Client connects to /microphone

Client: room:Living Room
Server: sampleRate:16000

Client: <raw float32 audio bytes>
Client: <raw float32 audio bytes>
Client: <raw float32 audio bytes>
```

### Synchronization

For precise speaker synchronization, clients can use the sync ping/pong mechanism:

```text
Client connects to /speaker

Client: sync_ping
Server: sync_pong:<timestamp>
```

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ dependencies = [
"rich>=14.0.0",
"threadpoolctl>=3.0.0",
"mcp>=1.25.0",
"websockets>=16.0",
]

[project.optional-dependencies]
Expand Down
26 changes: 21 additions & 5 deletions src/glados/audio_io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"""

import queue
from typing import Protocol
from typing import Protocol, Any

import numpy as np
from numpy.typing import NDArray
Expand All @@ -26,7 +26,7 @@ def __init__(self, vad_threshold: float | None = None) -> None: ...
def start_listening(self) -> None: ...
def stop_listening(self) -> None: ...
def start_speaking(
self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = ""
self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "", wait: bool = False
) -> None: ...
def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]: ...
def check_if_speaking(self) -> bool: ...
Expand All @@ -35,14 +35,23 @@ def get_sample_queue(self) -> queue.Queue[tuple[NDArray[np.float32], bool]]: ...


# Factory function
def get_audio_system(backend_type: str = "sounddevice", vad_threshold: float | None = None) -> AudioProtocol:
def get_audio_system(backend_type: str = "sounddevice", backend_options: dict[str, Any] | None = None, vad_threshold: float | None = None) -> AudioProtocol:
"""
Factory function to get an instance of an audio I/O system based on the specified backend type.

Parameters:
backend_type (str): The type of audio backend to use:
- "sounddevice": Uses the sounddevice library for local audio I/O
- "websocket": Network-based audio I/O (not yet implemented)
- "websocket": Network-based audio I/O
backend_options: Options for the specified backend.
- "sounddevice": No options are allowed.
- "websocket": The following options are allowed:
- server: Websocket listening address (default: 127.0.0.1)
- port: Websocket listening port (default: 5051)
- speaker_sync_delay_ms: Milliseconds to add to each speak start time to account for speaker synchronisation (default: 250)
- mic_max_silence_chunks: How many consecutive VAD chunks must be silent so that the current microphone relinquishes control (default: 10)
- default_room_tag: The default room tag to use if a client doesn't set it (default: office)
- segregate_speakers: If `True`, audio is only sent to speakers with the same room tag as the last active microphone
vad_threshold (float | None): Optional threshold for voice activity detection

Returns:
Expand All @@ -54,11 +63,18 @@ def get_audio_system(backend_type: str = "sounddevice", vad_threshold: float | N
if backend_type == "sounddevice":
from .sounddevice_io import SoundDeviceAudioIO

if backend_options is not None:
raise ValueError("Sounddevice backend does not support options")

# noinspection PyTypeChecker
return SoundDeviceAudioIO(
vad_threshold=vad_threshold,
)
elif backend_type == "websocket":
raise ValueError("WebSocket audio backend is not yet implemented.")
from .websocket_io import WebsocketAudioIO

# noinspection PyTypeChecker
return WebsocketAudioIO(vad_threshold=vad_threshold, options=backend_options)
else:
raise ValueError(f"Unsupported audio backend type: {backend_type}")

Expand Down
7 changes: 5 additions & 2 deletions src/glados/audio_io/sounddevice_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,19 +117,20 @@ def stop_listening(self) -> None:
finally:
self.input_stream = None

def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "") -> None:
def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = "", wait: bool = False) -> None:
"""Play audio through the system speakers.

Parameters:
audio_data: The audio data to play as a numpy float32 array
sample_rate: The sample rate of the audio data in Hz
text: Optional text associated with the audio (not used by this implementation)
wait: Optionally wait for the audio_data to be spoken

Raises:
RuntimeError: If audio playback cannot be initiated
ValueError: If audio_data is empty or not a valid numpy array
"""
if not isinstance(audio_data, np.ndarray) or audio_data.size == 0:
if not isinstance(audio_data, np.ndarray) or audio_data.size == 0 or audio_data.dtype != np.float32:
raise ValueError("Invalid audio data")

if sample_rate is None:
Expand All @@ -144,6 +145,8 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non
logger.debug(f"Playing audio with sample rate: {sample_rate} Hz, length: {len(audio_data)} samples")
self._is_playing = True
sd.play(audio_data, sample_rate)
if wait:
sd.wait()

def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]:
"""
Expand Down
Loading