Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ dependencies = [
"rich>=14.0.0",
"threadpoolctl>=3.0.0",
"mcp>=1.25.0",
"soxr>=0.3.0",
]

[project.optional-dependencies]
Expand Down
126 changes: 75 additions & 51 deletions src/glados/audio_io/sounddevice_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import threading
from typing import Any

import soxr

from loguru import logger
import numpy as np
from numpy.typing import NDArray
Expand Down Expand Up @@ -44,7 +46,11 @@ def __init__(self, vad_threshold: float | None = None) -> None:

self._sample_queue: queue.Queue[tuple[NDArray[np.float32], bool]] = queue.Queue()
self.input_stream: sd.InputStream | None = None
self._output_stream: sd.OutputStream | None = None
self._is_playing = False
self._playback_position = 0
self._playback_audio: NDArray[np.float32] = np.array([], dtype=np.float32)
self._playback_done = threading.Event()
self._playback_thread = None
self._stop_event = threading.Event()

Expand Down Expand Up @@ -137,22 +143,53 @@ def start_speaking(self, audio_data: NDArray[np.float32], sample_rate: int | Non

# Stop any existing playback
self.stop_speaking()

# Reset the stop event
self._stop_event.clear()

logger.debug(f"Playing audio with sample rate: {sample_rate} Hz, length: {len(audio_data)} samples")
# Ensure mono float32
audio = np.asarray(audio_data, dtype=np.float32)
if audio.ndim > 1:
audio = audio[:, 0]

# Resample to device native rate if needed to avoid low-quality SRC in PortAudio
device_rate = int(sd.query_devices(kind="output")["default_samplerate"])
if sample_rate != device_rate:
audio = soxr.resample(audio, sample_rate, device_rate, quality="HQ")
sample_rate = device_rate

self._playback_audio = audio
self._playback_sample_rate = sample_rate
Comment on lines +159 to +160

@coderabbitai coderabbitai Bot May 27, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Scale playback progress by sample-rate ratio.

_playback_position is counted in resampled device frames, but total_samples still comes from the original TTS buffer. When the rates differ, interruptions report the wrong percentage back to src/glados/core/speech_player.py—e.g. 22.05 kHz → 44.1 kHz will read roughly 100% at half the utterance.

💡 Proposed fix
         progress = getattr(self, "_playback_position", total_samples)
         self._is_playing = False
-        percentage_played = min(int(progress / total_samples * 100), 100) if total_samples > 0 else 100
+        if total_samples > 0 and sample_rate and self._playback_sample_rate > 0:
+            played_seconds = progress / self._playback_sample_rate
+            total_seconds = total_samples / sample_rate
+            percentage_played = min(int(played_seconds / total_seconds * 100), 100)
+        else:
+            percentage_played = 100
         return interrupted, percentage_played

Also applies to: 224-226

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/glados/audio_io/sounddevice_io.py` around lines 159 - 160,
_playback_position is tracked in device (resampled) frames while total_samples
is counted in frames of the original TTS buffer, so the two must be brought to
the same unit before computing a percentage (references: _playback_position,
_playback_audio, _playback_sample_rate, and total_samples). Convert both to
seconds: played = _playback_position / _playback_sample_rate and
total = total_samples / sample_rate (the original rate passed by the caller),
then compute min(int(played / total * 100), 100), falling back to 100 when
total_samples or either rate is not positive. Apply the same change in the
other occurrence noted around the block that handles interruptions (the second
spot mentioned near lines 224-226).

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make the fix. Also, can we avoid the new soxr dependency?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
cat pyproject.toml

Length of output: 5630



Happy to make both changes. Here's the plan:

  • Remove soxr>=0.3.0 from pyproject.toml
  • Drop the import soxr and replace soxr.resample(...) with a private _resample helper using np.interp (already available — no new dependency)
  • Fix measure_percentage_spoken to convert both progress and total_samples to seconds before computing percentage

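Note on resampling quality: np.interp (linear interpolation) is simpler than soxr's HQ sinc filter, but it applies no band-limiting filter, so it slightly attenuates high frequencies and can leave imaging artifacts. For typical TTS use-cases — especially integer upsampling ratios like 22 050 → 44 100 Hz — this is usually acceptable for speech, though not strictly transparent. If higher quality becomes important later, scipy.signal.resample_poly can be swapped in with a one-line change to the helper, at the cost of adding scipy as a dependency.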


🧬 Code Graph Analysis Results

src/glados/audio_io/sounddevice_io.py

Lines 5-12 (relevant to avoiding the new soxr dependency):

import soxr

from loguru import logger
import numpy as np
from numpy.typing import NDArray
import sounddevice as sd  # type: ignore

from . import VAD

Lines ~104-137 (resampling uses soxr, likely what the user wants to remove):

        # Resample to device native rate if needed to avoid low-quality SRC in PortAudio
        device_rate = int(sd.query_devices(kind="output")["default_samplerate"])
        if sample_rate != device_rate:
            audio = soxr.resample(audio, sample_rate, device_rate, quality="HQ")
            sample_rate = device_rate

Lines ~60-84 (VAD chunk sizing; relevant if the “fix” relates to VAD compatibility):

            self.input_stream = sd.InputStream(
                samplerate=self.SAMPLE_RATE,
                channels=1,
                callback=audio_callback,
                blocksize=int(self.SAMPLE_RATE * self.VAD_SIZE / 1000),
            )

src/glados/audio_io/vad.py

Lines ~41-88 (VAD constraints that must match microphone chunk sizing / sample rate):

    def __call__(self, audio_sample: NDArray[np.float32], sample_rate: int = SAMPLE_RATE) -> NDArray[np.float32]:
        """Process a batch of audio samples and return the VAD output."""
        if sample_rate != 16000:
            raise ValueError("silero_vad_16k_op15.onnx only supports 16000 Hz audio.")
        num_samples = 512

        if audio_sample.shape[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {audio_sample.shape[-1]} "
                f"(Supported values: 256 for 8000 sample rate, 512 for 16000)"
            )

Lines ~88-116 (context/stateful inference details):

        batch_size = audio_sample.shape[0]
        context_size = 64 if sample_rate == 16000 else 32

        if not self._last_batch_size:
            self.reset_states(batch_size)
        if (self._last_sr) and (self._last_sr != sample_rate):
            self.reset_states(batch_size)
        if (self._last_batch_size) and (self._last_batch_size != batch_size):
            self.reset_states(batch_size)

        if not len(self._context):
            self._context = np.zeros((batch_size, context_size), dtype=np.float32)

        audio_sample = np.concatenate([self._context, audio_sample], axis=1)

        ort_inputs = {
            "input": audio_sample.astype(np.float32),
            "state": self._state,
            "sr": np.array(sample_rate, dtype=np.int64),
        }
        ort_outs = self.ort_sess.run(None, ort_inputs)
        out, state = ort_outs

src/glados/audio_io/__init__.py

Lines 26-34 (method signatures for reference while adjusting behavior):

def start_listening(self) -> None: ...
def stop_listening(self) -> None: ...
def start_speaking(
        self, audio_data: NDArray[np.float32], sample_rate: int | None = None, text: str = ""
    ) -> None: ...
def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]: ...
def check_if_speaking(self) -> bool: ...
def stop_speaking(self) -> None: ...
def get_sample_queue(self) -> queue.Queue[tuple[NDArray[np.float32], bool]]: ...

Failed to handle agent chat message. Please try again.

self._playback_position = 0
self._playback_done = threading.Event()
self._is_playing = True
sd.play(audio_data, sample_rate)

def _callback(outdata: NDArray[np.float32], frames: int, t: Any, status: sd.CallbackFlags) -> None:
    """Fill one output buffer from the pending playback audio.

    Copies the next ``frames`` samples of ``self._playback_audio`` into
    channel 0 of ``outdata`` and advances ``self._playback_position`` by the
    number of samples actually copied. When fewer than ``frames`` samples
    remain, the rest of the buffer is zero-filled, ``self._playback_done`` is
    set, and ``sd.CallbackStop`` is raised.
    """
    if status:
        logger.debug(f"Playback callback status: {status}")

    start = self._playback_position
    chunk = self._playback_audio[start : start + frames]
    copied = len(chunk)

    if copied >= frames:
        # Full buffer available: hand it over and keep streaming.
        outdata[:, 0] = chunk
        self._playback_position += frames
        return

    # Final, partial buffer: pad the tail with silence and finish.
    outdata[:copied, 0] = chunk
    outdata[copied:, 0] = 0
    self._playback_position += copied
    self._playback_done.set()
    raise sd.CallbackStop

self._output_stream = sd.OutputStream(
samplerate=sample_rate,
channels=1,
dtype="float32",
callback=_callback,
finished_callback=self._playback_done.set,
)
logger.debug(f"Playing audio with sample rate: {sample_rate} Hz, length: {len(audio)} samples")
self._output_stream.start()

def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None = None) -> tuple[bool, int]:
"""
Monitor audio playback progress and return completion status with interrupt detection.

Streams audio samples through PortAudio and actively tracks the number of samples
that have been played. The playback can be interrupted by setting self.processing
to False or self.shutdown_event. Uses a non-blocking callback system with a completion event for
synchronization.
Wait for playback to complete or be interrupted, returning progress.

Args:
total_samples (int): Total number of samples in the audio data being played.
Expand All @@ -165,43 +202,28 @@ def measure_percentage_spoken(self, total_samples: int, sample_rate: int | None
sample_rate = self.SAMPLE_RATE

interrupted = False
progress = 0
completion_event = threading.Event()

def stream_callback(
outdata: NDArray[np.float32], frames: int, time: dict[str, Any], status: sd.CallbackFlags
) -> None:
nonlocal progress, interrupted
progress += frames
if self._is_playing is False:
interrupted = True
completion_event.set()
if progress >= total_samples:
completion_event.set()
outdata.fill(0)

try:
logger.debug(f"Using sample rate: {sample_rate} Hz, total samples: {total_samples}")
stream = sd.OutputStream(
callback=stream_callback,
samplerate=sample_rate,
channels=1,
finished_callback=completion_event.set,
)
with stream:
# Add a reasonable maximum timeout to prevent indefinite blocking
max_timeout = total_samples / sample_rate
completed = completion_event.wait(max_timeout + 1) # Add a small buffer to ensure completion
if not completed:
# If the event timed out, force interruption
self._is_playing = False
poll_interval = 0.01

while True:
if self._is_playing is False:
interrupted = True
logger.debug("Audio playback timed out, forcing interruption")
break
done = self._playback_done.wait(timeout=poll_interval)
if done:
break

# Wait a tiny bit to let the stream finish cleanly
if not interrupted and hasattr(self, "_output_stream") and self._output_stream is not None:
self._output_stream.stop()

except (sd.PortAudioError, RuntimeError):
logger.debug("Audio stream already closed or invalid")
except Exception as e:
logger.debug(f"measure_percentage_spoken error: {e}")

percentage_played = min(int(progress / total_samples * 100), 100)
progress = getattr(self, "_playback_position", total_samples)
self._is_playing = False
percentage_played = min(int(progress / total_samples * 100), 100) if total_samples > 0 else 100
return interrupted, percentage_played

def check_if_speaking(self) -> bool:
Expand All @@ -213,17 +235,19 @@ def check_if_speaking(self) -> bool:
return self._is_playing

def stop_speaking(self) -> None:
"""Stop audio playback and clean up resources.

Interrupts any ongoing audio playback and waits for the playback thread
to terminate. This ensures clean resource management and prevents
multiple overlapping playbacks.
"""
"""Stop audio playback and clean up resources."""
if self._is_playing:
self._stop_event.set()
sd.stop()

self._is_playing = False
self._stop_event.set()
if hasattr(self, "_playback_done"):
self._playback_done.set()
if hasattr(self, "_output_stream") and self._output_stream is not None:
try:
self._output_stream.stop()
self._output_stream.close()
except Exception:
pass
self._output_stream = None

def get_sample_queue(self) -> queue.Queue[tuple[NDArray[np.float32], bool]]:
"""Get the queue containing audio samples and VAD confidence.
Expand Down