diff --git a/Makefile b/Makefile index 7065549..4f1b0bd 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help check test fmt lint doc ci accuracy mel +.PHONY: help check test fmt lint doc ci accuracy mel example-controller help: @echo "Available targets:" @@ -10,6 +10,7 @@ help: @echo " lint Run clippy with warnings as errors" @echo " doc Build and open docs in browser" @echo " ci Run all CI checks locally (fmt, clippy, test, doc, features)" + @echo " example-controller Run TurnController example" # Check workspace compiles check: @@ -27,6 +28,10 @@ accuracy: mel: cargo test --features pipecat -- mel_report --ignored --nocapture +# Run TurnController example +example-controller: + cargo run --features pipecat --example controller + # Format code fmt: cargo fmt --all diff --git a/README.md b/README.md index 97d850d..c560e4b 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ models behind common Rust traits. Same pattern as [wavekat-vad](https://github.com/wavekat/wavekat-vad). > [!WARNING] -> Early development. Trait API is defined; backend implementations are stubs pending ONNX model integration. +> Early development. API may change between minor versions. ## Backends @@ -27,25 +27,34 @@ models behind common Rust traits. 
Same pattern as cargo add wavekat-turn --features pipecat ``` -Use the audio-based detector: +Use `TurnController` to wrap any detector with automatic state tracking: ```rust -use wavekat_turn::{AudioTurnDetector, TurnState}; +use wavekat_turn::{TurnController, TurnState}; use wavekat_turn::audio::PipecatSmartTurn; -let mut detector = PipecatSmartTurn::new()?; +let detector = PipecatSmartTurn::new()?; +let mut ctrl = TurnController::new(detector); -// Feed 16 kHz f32 PCM frames after VAD detects silence -let prediction = detector.predict_audio(&audio_frames)?; +// Feed audio continuously +ctrl.push_audio(&audio_frame); +// VAD speech start — soft reset (keeps buffer if turn was unfinished) +ctrl.reset_if_finished(); + +// VAD speech end — predict +let prediction = ctrl.predict()?; match prediction.state { TurnState::Finished => { /* user is done, send to LLM */ } TurnState::Unfinished => { /* keep listening */ } TurnState::Wait => { /* user asked AI to hold */ } } + +// After assistant finishes responding — hard reset +ctrl.reset(); ``` -Or the text-based detector: +Or the text-based detector directly: ```rust use wavekat_turn::{TextTurnDetector, TurnState}; @@ -57,6 +66,9 @@ let prediction = detector.predict_text("I was wondering if", &context)?; assert_eq!(prediction.state, TurnState::Unfinished); ``` +See [`examples/controller.rs`](crates/wavekat-turn/examples/controller.rs) for a +full walkthrough with real audio. + ## Architecture Two trait families cover the two input modalities: @@ -64,6 +76,9 @@ Two trait families cover the two input modalities: - **`AudioTurnDetector`** -- operates on raw audio frames (no ASR needed) - **`TextTurnDetector`** -- operates on ASR transcript text with optional conversation context +`TurnController` wraps any `AudioTurnDetector` and adds orchestration helpers +like soft-reset (preserves buffer when the user pauses mid-sentence). + ``` wavekat-vad --> "is someone speaking?" wavekat-turn --> "are they done speaking?" 
diff --git a/crates/wavekat-turn/Cargo.toml b/crates/wavekat-turn/Cargo.toml index 9284d18..6d76eb1 100644 --- a/crates/wavekat-turn/Cargo.toml +++ b/crates/wavekat-turn/Cargo.toml @@ -36,6 +36,10 @@ ndarray-npy = "0.10" serde = { version = "1", features = ["derive"] } serde_json = "1" +[[example]] +name = "controller" +required-features = ["pipecat"] + [package.metadata.docs.rs] all-features = true rustdoc-args = ["--cfg", "docsrs"] diff --git a/crates/wavekat-turn/examples/controller.rs b/crates/wavekat-turn/examples/controller.rs new file mode 100644 index 0000000..4707385 --- /dev/null +++ b/crates/wavekat-turn/examples/controller.rs @@ -0,0 +1,100 @@ +//! Example: using TurnController for VAD-driven turn detection. +//! +//! Run with: `cargo run --features pipecat --example controller` +//! +//! Demonstrates the soft-reset flow using real WAV fixtures: +//! +//! 1. User speaks mid-sentence (speech_mid.wav) → Unfinished +//! 2. User continues speaking — soft reset keeps the buffer intact +//! 3. User finishes the sentence (speech_finished.wav) → Finished +//! 4. 
After assistant responds, hard reset starts a fresh turn + +use std::path::Path; + +use wavekat_turn::audio::PipecatSmartTurn; +use wavekat_turn::{AudioFrame, TurnController}; + +fn load_wav(path: &Path) -> Vec { + let mut reader = hound::WavReader::open(path) + .unwrap_or_else(|e| panic!("failed to open {}: {}", path.display(), e)); + let spec = reader.spec(); + match spec.sample_format { + hound::SampleFormat::Int => reader + .samples::() + .map(|s| s.unwrap() as f32 / 32768.0) + .collect(), + hound::SampleFormat::Float => reader.samples::().map(|s| s.unwrap()).collect(), + } +} + +fn main() -> Result<(), Box> { + let fixtures = Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .parent() + .unwrap() + .join("tests/fixtures"); + + let speech_mid = load_wav(&fixtures.join("speech_mid.wav")); + let speech_finished = load_wav(&fixtures.join("speech_finished.wav")); + + let detector = PipecatSmartTurn::new()?; + let mut ctrl = TurnController::new(detector); + + // --- Speech A: user says something mid-sentence --- + println!(">> VAD: speech started"); + ctrl.reset_if_finished(); // first speech → resets + + println!(">> Pushing speech_mid.wav (cut mid-sentence)"); + ctrl.push_audio(&AudioFrame::new(&speech_mid[..], 16_000)); + + println!(">> VAD: speech ended"); + let result_a = ctrl.predict()?; + println!( + " predict → {:?} (confidence: {:.3})", + result_a.state, result_a.confidence + ); + + // --- Speech B: user continues speaking --- + println!("\n>> VAD: speech started again"); + let did_reset = ctrl.reset_if_finished(); + println!( + " reset_if_finished → {}", + if did_reset { + "reset (turn was finished)" + } else { + "skipped (turn unfinished, keeping buffer)" + } + ); + + println!(">> Pushing speech_finished.wav (complete sentence)"); + ctrl.push_audio(&AudioFrame::new(&speech_finished[..], 16_000)); + + println!(">> VAD: speech ended"); + let result_b = ctrl.predict()?; + println!( + " predict → {:?} (confidence: {:.3}, ran on A+B combined)", + 
result_b.state, result_b.confidence + ); + + // --- New turn: after assistant responds --- + println!("\n>> Assistant finished responding"); + ctrl.reset(); // hard reset for next turn + println!(" hard reset, last_state: {:?}", ctrl.last_state()); + + // --- Speech C: fresh turn --- + println!("\n>> VAD: speech started (new turn)"); + ctrl.reset_if_finished(); // last_state is None → resets + + println!(">> Pushing speech_finished.wav"); + ctrl.push_audio(&AudioFrame::new(&speech_finished[..], 16_000)); + + println!(">> VAD: speech ended"); + let result_c = ctrl.predict()?; + println!( + " predict → {:?} (confidence: {:.3})", + result_c.state, result_c.confidence + ); + + Ok(()) +} diff --git a/crates/wavekat-turn/src/controller.rs b/crates/wavekat-turn/src/controller.rs new file mode 100644 index 0000000..163e4df --- /dev/null +++ b/crates/wavekat-turn/src/controller.rs @@ -0,0 +1,100 @@ +use crate::{AudioFrame, AudioTurnDetector, TurnError, TurnPrediction, TurnState}; + +/// Orchestration wrapper around any [`AudioTurnDetector`]. +/// +/// Tracks prediction state across calls and provides convenience methods +/// like [`reset_if_finished`](TurnController::reset_if_finished) for +/// correct VAD integration without manual state bookkeeping. +/// +/// # Usage +/// +/// ```ignore +/// let detector = PipecatSmartTurn::new()?; +/// let mut ctrl = TurnController::new(detector); +/// +/// // Audio arrives continuously +/// ctrl.push_audio(&frame); +/// +/// // VAD speech start — soft reset (keeps buffer if turn was unfinished) +/// ctrl.reset_if_finished(); +/// +/// // VAD speech end — predict +/// let result = ctrl.predict()?; +/// ``` +/// +/// See [`reset_if_finished`](TurnController::reset_if_finished) for details +/// on when to use soft vs hard reset. +pub struct TurnController { + inner: T, + last_state: Option, +} + +impl TurnController { + /// Create a new controller wrapping the given detector. 
+    pub fn new(inner: T) -> Self {
+        Self {
+            inner,
+            last_state: None,
+        }
+    }
+
+    /// Feed audio into the detector.
+    pub fn push_audio(&mut self, frame: &AudioFrame) {
+        self.inner.push_audio(frame);
+    }
+
+    /// Run prediction on buffered audio.
+    ///
+    /// Tracks the result state internally for [`reset_if_finished`](Self::reset_if_finished).
+    pub fn predict(&mut self) -> Result<TurnPrediction, TurnError> {
+        let result = self.inner.predict()?;
+        self.last_state = Some(result.state);
+        Ok(result)
+    }
+
+    /// Hard reset — always clears the buffer.
+    ///
+    /// Use when you know a new turn is starting (e.g. after the assistant
+    /// finishes responding).
+    pub fn reset(&mut self) {
+        self.inner.reset();
+        self.last_state = None;
+    }
+
+    /// Soft reset — clears the buffer only if the last prediction was
+    /// [`Finished`](TurnState::Finished) or no prediction has been made
+    /// since the last reset.
+    ///
+    /// Returns `true` if a reset occurred, `false` if skipped.
+    ///
+    /// Call this on VAD speech-start when you don't know whether the user
+    /// is continuing the same turn or starting a new one. If the previous
+    /// prediction was [`Unfinished`](TurnState::Unfinished), the buffer is
+    /// preserved so the next [`predict`](Self::predict) runs on the full
+    /// accumulated audio.
+    pub fn reset_if_finished(&mut self) -> bool {
+        match self.last_state {
+            Some(TurnState::Unfinished) => false,
+            _ => {
+                self.reset();
+                true
+            }
+        }
+    }
+
+    /// Returns the state from the last [`predict`](Self::predict) call,
+    /// or `None` if no prediction has been made since the last reset.
+    pub fn last_state(&self) -> Option<TurnState> {
+        self.last_state
+    }
+
+    /// Returns a mutable reference to the inner detector.
+    pub fn inner_mut(&mut self) -> &mut T {
+        &mut self.inner
+    }
+
+    /// Unwrap the controller, returning the inner detector.
+ pub fn into_inner(self) -> T { + self.inner + } +} diff --git a/crates/wavekat-turn/src/lib.rs b/crates/wavekat-turn/src/lib.rs index 98ca819..4c6f020 100644 --- a/crates/wavekat-turn/src/lib.rs +++ b/crates/wavekat-turn/src/lib.rs @@ -9,6 +9,10 @@ //! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn) //! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU) //! +//! For most use cases, wrap a detector in [`TurnController`] to get +//! automatic state tracking and soft-reset logic for VAD integration. +//! See [`controller`] for details. +//! //! # Feature flags //! //! | Feature | Backend | Input | @@ -16,6 +20,7 @@ //! | `pipecat` | Pipecat Smart Turn v3 (ONNX) | Audio (16 kHz) | //! | `livekit` | LiveKit Turn Detector (ONNX) | Text | +pub mod controller; pub mod error; #[cfg(any(feature = "pipecat", feature = "livekit"))] @@ -27,6 +32,7 @@ pub mod audio; #[cfg(feature = "livekit")] pub mod text; +pub use controller::TurnController; pub use error::TurnError; pub use wavekat_core::AudioFrame; @@ -77,11 +83,23 @@ pub enum Role { /// Turn detector that operates on raw audio. /// /// Implementations buffer audio internally and run prediction on demand. -/// The typical flow with VAD: +/// +/// **Most users should wrap this in [`TurnController`]** rather than calling +/// these methods directly. The controller tracks prediction state and provides +/// [`reset_if_finished`](TurnController::reset_if_finished) for correct +/// multi-utterance handling. +/// +/// # Direct usage (advanced) +/// +/// If you need full control over reset logic: /// /// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio) -/// 2. **VAD fires "speech started"** → [`reset`](AudioTurnDetector::reset) -/// 3. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict) +/// 2. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict) +/// 3. 
**New turn begins** → [`reset`](AudioTurnDetector::reset) +/// +/// Note: calling `reset` unconditionally on every VAD speech-start will discard +/// audio context when the user pauses mid-sentence. See [`TurnController`] for +/// the recommended approach. pub trait AudioTurnDetector: Send + Sync { /// Feed audio into the internal buffer. /// @@ -90,10 +108,17 @@ pub trait AudioTurnDetector: Send + Sync { /// Run prediction on buffered audio. /// - /// Call when VAD detects end of speech. + /// Call when VAD detects end of speech. The buffer is **not** cleared + /// after prediction — call [`reset`](AudioTurnDetector::reset) explicitly + /// when starting a new turn. fn predict(&mut self) -> Result; - /// Clear the internal buffer. Call when a new speech turn begins. + /// Unconditionally clear the internal buffer. + /// + /// Use when you are certain a new turn is starting (e.g. after the + /// assistant finishes responding). For VAD speech-start events where + /// the user may be continuing, prefer + /// [`TurnController::reset_if_finished`]. fn reset(&mut self); } diff --git a/crates/wavekat-turn/tests/controller.rs b/crates/wavekat-turn/tests/controller.rs new file mode 100644 index 0000000..e8edf88 --- /dev/null +++ b/crates/wavekat-turn/tests/controller.rs @@ -0,0 +1,163 @@ +//! Tests for [`TurnController`]. +//! +//! Uses a mock detector to test orchestration logic without ONNX overhead. + +use wavekat_turn::{ + AudioFrame, AudioTurnDetector, TurnController, TurnError, TurnPrediction, TurnState, +}; + +// --------------------------------------------------------------------------- +// Mock detector +// --------------------------------------------------------------------------- + +/// A minimal detector that records calls and returns a configurable state. +struct MockDetector { + /// The state to return on the next `predict()` call. + next_state: TurnState, + /// Number of samples in the buffer (cleared by reset). 
+ buffer_len: usize, + /// How many times `reset()` was called. + reset_count: usize, +} + +impl MockDetector { + fn new() -> Self { + Self { + next_state: TurnState::Unfinished, + buffer_len: 0, + reset_count: 0, + } + } +} + +impl AudioTurnDetector for MockDetector { + fn push_audio(&mut self, frame: &AudioFrame) { + self.buffer_len += frame.samples().len(); + } + + fn predict(&mut self) -> Result { + let state = self.next_state; + let confidence = match state { + TurnState::Finished => 0.95, + TurnState::Unfinished => 0.80, + TurnState::Wait => 0.70, + }; + Ok(TurnPrediction { + state, + confidence, + latency_ms: 0, + stage_times: vec![], + }) + } + + fn reset(&mut self) { + self.buffer_len = 0; + self.reset_count += 1; + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[test] +fn reset_if_finished_resets_on_first_call() { + let mut ctrl = TurnController::new(MockDetector::new()); + assert!( + ctrl.reset_if_finished(), + "should reset when no prior prediction" + ); +} + +#[test] +fn reset_if_finished_skips_after_unfinished() { + let mut ctrl = TurnController::new(MockDetector::new()); + ctrl.inner_mut().next_state = TurnState::Unfinished; + ctrl.predict().unwrap(); + + assert!( + !ctrl.reset_if_finished(), + "should skip reset after Unfinished" + ); +} + +#[test] +fn reset_if_finished_resets_after_finished() { + let mut ctrl = TurnController::new(MockDetector::new()); + ctrl.inner_mut().next_state = TurnState::Finished; + ctrl.predict().unwrap(); + + assert!(ctrl.reset_if_finished(), "should reset after Finished"); +} + +#[test] +fn hard_reset_always_clears() { + let mut ctrl = TurnController::new(MockDetector::new()); + ctrl.inner_mut().next_state = TurnState::Unfinished; + ctrl.predict().unwrap(); + + ctrl.reset(); + assert_eq!( + ctrl.last_state(), + None, + "hard reset should clear last_state" + ); + 
assert_eq!(ctrl.inner_mut().reset_count, 1); +} + +#[test] +fn last_state_tracks_predictions() { + let mut ctrl = TurnController::new(MockDetector::new()); + assert_eq!(ctrl.last_state(), None); + + ctrl.inner_mut().next_state = TurnState::Unfinished; + ctrl.predict().unwrap(); + assert_eq!(ctrl.last_state(), Some(TurnState::Unfinished)); + + ctrl.inner_mut().next_state = TurnState::Finished; + ctrl.predict().unwrap(); + assert_eq!(ctrl.last_state(), Some(TurnState::Finished)); + + ctrl.reset(); + assert_eq!(ctrl.last_state(), None); +} + +#[test] +fn predict_accumulates_across_soft_reset() { + let mut ctrl = TurnController::new(MockDetector::new()); + + // Speech A + let frame_a = AudioFrame::new(&[0.1f32; 1600][..], 16_000).into_owned(); + ctrl.push_audio(&frame_a); + ctrl.inner_mut().next_state = TurnState::Unfinished; + ctrl.predict().unwrap(); + + // Soft reset — should NOT clear buffer + assert!(!ctrl.reset_if_finished()); + + // Speech B + let frame_b = AudioFrame::new(&[0.2f32; 1600][..], 16_000).into_owned(); + ctrl.push_audio(&frame_b); + + // Buffer should contain both A and B + assert_eq!( + ctrl.inner_mut().buffer_len, + 3200, + "buffer should have A + B samples" + ); + assert_eq!( + ctrl.inner_mut().reset_count, + 0, + "no resets should have occurred" + ); +} + +#[test] +fn into_inner_returns_detector() { + let mut ctrl = TurnController::new(MockDetector::new()); + let frame = AudioFrame::new(&[0.0f32; 160][..], 16_000).into_owned(); + ctrl.push_audio(&frame); + + let detector = ctrl.into_inner(); + assert_eq!(detector.buffer_len, 160); +} diff --git a/docs/plan-backends.md b/docs/plan-backends.md index ec17c97..57c59a0 100644 --- a/docs/plan-backends.md +++ b/docs/plan-backends.md @@ -27,11 +27,13 @@ ## Current state `PipecatSmartTurn` is fully implemented and all integration tests pass. +`TurnController` wraps any `AudioTurnDetector` with state tracking and soft-reset. `LiveKitEou` remains a stub (out of scope for this branch). 
``` src/ ├── lib.rs — traits: AudioTurnDetector, TextTurnDetector, TurnPrediction, TurnState +├── controller.rs — TurnController orchestration wrapper ├── error.rs — TurnError: BackendError, InvalidInput, ModelNotLoaded ├── onnx.rs — shared session_from_file / session_from_memory helpers ├── audio/ @@ -41,7 +43,10 @@ src/ ├── mod.rs └── livekit.rs — LiveKitEou (stub, out of scope) build.rs — downloads smart-turn-v3.2-cpu.onnx at build time +examples/ +└── controller.rs — TurnController usage with real WAV fixtures tests/ +├── controller.rs — 7 TurnController tests (mock detector) └── pipecat.rs — 9 integration tests (all pass) ``` diff --git a/docs/plan-turn-controller.md b/docs/plan-turn-controller.md new file mode 100644 index 0000000..8b16bc4 --- /dev/null +++ b/docs/plan-turn-controller.md @@ -0,0 +1,224 @@ +# Plan: TurnController Wrapper + +**Status:** Complete +**Date:** 2026-03-31 + +--- + +## Problem + +The `AudioTurnDetector` trait documents a simple flow: + +1. Every audio chunk → `push_audio` +2. VAD fires "speech started" → `reset` +3. VAD fires "speech stopped" → `predict` + +This works for the basic case, but breaks when the user continues speaking after a +brief pause — a common pattern in natural conversation (e.g. "I want to order... um... +a pizza"). + +### What goes wrong + +Consider this sequence: + +``` +VAD speech start → reset() buffer cleared + (push_audio) buffer has speech A +VAD speech end → predict() → Unfinished +VAD speech start → reset() ← WRONG: clears speech A + (push_audio) buffer has speech B only +VAD speech end → predict() runs on B alone, missing context +``` + +The Pipecat Smart Turn documentation explicitly says: + +> If additional speech is detected from the user before Smart Turn has finished +> executing, re-run Smart Turn on the entire turn recording, including the new audio, +> rather than just the new segment. 
Smart Turn works best when given sufficient context, +> and is not designed to run on very short audio segments. + +The correct behavior is to **skip the reset** when the previous prediction was +`Unfinished`, so the buffer accumulates across the full turn: + +``` +VAD speech start → reset() buffer cleared (first speech) + (push_audio) buffer has speech A +VAD speech end → predict() → Unfinished +VAD speech start → DON'T reset buffer keeps speech A + (push_audio) buffer has speech A + B +VAD speech end → predict() runs on A+B combined ✓ +``` + +### Why this doesn't belong in the trait + +The `AudioTurnDetector` trait is the right abstraction for backend authors — it's +minimal, and `reset()` is a clean primitive ("clear everything"). The soft-reset +decision depends on tracking the last prediction state, which is orchestration logic. + +Every orchestrator would have to re-implement this same logic. As a library, we should +provide a helper that does it correctly out of the box. + +--- + +## Solution: `TurnController` + +A generic wrapper around any `AudioTurnDetector` that tracks prediction state and +provides convenience methods. + +```rust +pub struct TurnController { + inner: T, + last_state: Option, +} +``` + +### API + +```rust +impl TurnController { + /// Create a new controller wrapping the given detector. + pub fn new(inner: T) -> Self; + + /// Feed audio into the detector. + pub fn push_audio(&mut self, frame: &AudioFrame); + + /// Run prediction on buffered audio. + /// Tracks the result state internally for `reset_if_finished`. + pub fn predict(&mut self) -> Result; + + /// Hard reset — always clears the buffer. Use when you know a new turn + /// is starting (e.g. after the assistant finishes responding). + pub fn reset(&mut self); + + /// Soft reset — clears the buffer only if the last prediction was + /// `Finished` (or no prediction has been made yet). Returns whether + /// a reset actually occurred. 
+ /// + /// Call this on VAD speech-start when you don't know whether the user + /// is continuing the same turn or starting a new one. + pub fn reset_if_finished(&mut self) -> bool; + + /// Returns the state from the last `predict()` call, or `None` if + /// no prediction has been made since the last reset. + pub fn last_state(&self) -> Option; + + /// Unwrap the controller, returning the inner detector. + pub fn into_inner(self) -> T; +} +``` + +### Usage + +```rust +let detector = PipecatSmartTurn::new()?; +let mut ctrl = TurnController::new(detector); + +// Audio arrives continuously +ctrl.push_audio(&frame); + +// VAD speech start — soft reset (keeps buffer if turn was unfinished) +ctrl.reset_if_finished(); + +// VAD speech end — predict +let result = ctrl.predict()?; +match result.state { + TurnState::Finished => { /* hand off to LLM */ } + TurnState::Unfinished => { /* wait for more speech */ } +} + +// After assistant finishes responding — hard reset for next turn +ctrl.reset(); +``` + +### Scenario walkthrough + +```rust +// Speech A — user says "I want to order..." +ctrl.reset_if_finished(); // no prior prediction → resets ✓ +ctrl.push_audio(&speech_a); +let a = ctrl.predict()?; // → Unfinished + +// Speech B — user continues "...a pizza" +ctrl.reset_if_finished(); // last was Unfinished → NO reset ✓ +ctrl.push_audio(&speech_b); +let b = ctrl.predict()?; // runs on A+B combined → Finished ✓ + +// Speech C — new conversation turn +ctrl.reset(); // hard reset after assistant responded +ctrl.push_audio(&speech_c); +let c = ctrl.predict()?; // runs on C only ✓ +``` + +--- + +## Design decisions + +### Why `TurnController` and not a trait method + +- Rust traits can't have fields, so every implementor would duplicate the + `last_state` tracking boilerplate. +- The soft-reset logic is identical across all backends — it only depends on + `TurnState`, not on backend internals. 
+- A wrapper keeps the trait minimal for backend authors while giving orchestrators + a batteries-included API. + +### Why `reset_if_finished` returns `bool` + +The orchestrator may want to know whether a reset occurred — e.g. for logging, +or to adjust behavior (start a new transcript vs. append to existing). + +### Why keep `reset()` on the controller + +Hard reset is still needed for cases the controller can't infer: +- After the assistant finishes responding (new conversation turn). +- Manual override / error recovery. +- First initialization. + +`reset_if_finished()` is the default for VAD speech-start events. +`reset()` is for explicit turn boundaries the orchestrator controls. + +--- + +## Future possibilities + +These are not part of the initial implementation but the `TurnController` is a +natural place to add them later: + +- **Min audio guard** — `predict()` returns early if the buffer is too short to + produce a meaningful prediction, avoiding wasted inference on tiny audio segments. +- **Configurable threshold** — override the default 0.5 probability threshold + without modifying the detector. +- **Prediction history** — track recent predictions for debugging and logging. + +--- + +## File placement + +``` +src/ +├── lib.rs — existing traits (unchanged) +├── controller.rs — TurnController ← NEW +├── audio/ +│ └── pipecat.rs — PipecatSmartTurn (unchanged) +└── ... 
+```
+
+Re-export from `lib.rs`:
+
+```rust
+mod controller;
+pub use controller::TurnController;
+```
+
+---
+
+## Tests
+
+| Test | What it checks |
+|------|---------------|
+| `reset_if_finished_resets_on_first_call` | No prior prediction → resets |
+| `reset_if_finished_skips_after_unfinished` | Last predict was Unfinished → no reset |
+| `reset_if_finished_resets_after_finished` | Last predict was Finished → resets |
+| `hard_reset_always_clears` | `reset()` clears regardless of last state |
+| `last_state_tracks_predictions` | `last_state()` returns correct value after predict/reset |
+| `predict_accumulates_across_soft_reset` | Buffer preserved when soft reset skips → predict uses full audio |
+| `into_inner_returns_detector` | `into_inner()` returns the wrapped detector with its buffer intact |