magicnight · magicnight · May 10, 2026 · May 10, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,14 @@ Versioning follows [Semantic Versioning](https://semver.org/).
 ## [Unreleased]
 
 ### Added
+- **Settings audio fields** (v0.6 audio foundation). Schema-only —
+  no runtime audio yet, just persistence, so the v0.6 STT / TTS
+  feature work has a settled storage schema to build on. Five new
+  keys with audio-off defaults: `audioEnabled`, `sttModel`,
+  `ttsModel`, `ttsVoice`, `ttsAutoSpeak`. Backwards-compatible decode:
+  pre-v0.6 `~/.mac-mlx/settings.json` files load unchanged (every new
+  key decodes via `decodeIfPresent` and falls back to "audio off").
+  Three new tests cover defaults / round-trip / legacy-JSON decode.
 - **MCP Client Config** (v0.5 MCP track, part 1 of 2). Pure-Swift
   data layer for connecting macMLX to external MCP servers (mirror
   of v0.4.0's MCP server role, but reversed: now we *are* the host

diff --git a/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift b/MacMLXCore/Sources/MacMLXCore/Managers/SettingsManager.swift
@@ -69,6 +69,32 @@ public struct Settings: Codable, Equatable, Sendable {
     /// for small-memory Macs.
     public var maxResidentMemoryGB: Int
 
+    // MARK: - Speech I/O (v0.6+)
+
+    /// Master toggle for speech features. `false` keeps mic capture
+    /// and TTS playback completely off, mirroring the v0.6 first-run UX.
+    public var audioEnabled: Bool
+
+    /// Identifier of the STT model to load on demand
+    /// (e.g. `whisper-small`, `whisper-medium`, `whisper-large-v3`,
+    /// `fun-asr`). Nil means "user hasn't picked one" — the chat
+    /// input's mic button surfaces a one-shot picker on first use.
+    public var sttModel: String?
+
+    /// Identifier of the TTS model
+    /// (e.g. `marvis`, `chatterbox`, `cosyvoice2`). Nil = no TTS
+    /// model picked.
+    public var ttsModel: String?
+
+    /// Voice id passed to the TTS model. Voice cloning works by
+    /// pointing this at a `~/.mac-mlx/audio/voices/<name>.wav`
+    /// reference clip. Nil = use the model's default voice.
+    public var ttsVoice: String?
+
+    /// Auto-speak completed assistant replies. False (default) keeps
+    /// playback opt-in via the per-bubble speaker button.
+    public var ttsAutoSpeak: Bool
+
     // MARK: Factory
 
     /// Sensible out-of-the-box defaults — used when no settings file exists.
@@ -89,7 +115,12 @@ public struct Settings: Codable, Equatable, Sendable {
         hfEndpoint: "https://huggingface.co",
         kvCacheHotMB: 512,
         kvCacheColdGB: 20,
-        maxResidentMemoryGB: max(4, Int(MemoryProbe.totalMemoryGB()) / 2)
+        maxResidentMemoryGB: max(4, Int(MemoryProbe.totalMemoryGB()) / 2),
+        audioEnabled: false,
+        sttModel: nil,
+        ttsModel: nil,
+        ttsVoice: nil,
+        ttsAutoSpeak: false
     )
 
     // MARK: Init
@@ -108,7 +139,12 @@ public struct Settings: Codable, Equatable, Sendable {
         hfEndpoint: String = "https://huggingface.co",
         kvCacheHotMB: Int = 512,
         kvCacheColdGB: Int = 20,
-        maxResidentMemoryGB: Int = max(4, Int(MemoryProbe.totalMemoryGB()) / 2)
+        maxResidentMemoryGB: Int = max(4, Int(MemoryProbe.totalMemoryGB()) / 2),
+        audioEnabled: Bool = false,
+        sttModel: String? = nil,
+        ttsModel: String? = nil,
+        ttsVoice: String? = nil,
+        ttsAutoSpeak: Bool = false
     ) {
         self.modelDirectory = modelDirectory
         self.preferredEngine = preferredEngine
@@ -124,6 +160,11 @@ public struct Settings: Codable, Equatable, Sendable {
         self.kvCacheHotMB = kvCacheHotMB
         self.kvCacheColdGB = kvCacheColdGB
         self.maxResidentMemoryGB = maxResidentMemoryGB
+        self.audioEnabled = audioEnabled
+        self.sttModel = sttModel
+        self.ttsModel = ttsModel
+        self.ttsVoice = ttsVoice
+        self.ttsAutoSpeak = ttsAutoSpeak
     }
 
     // MARK: - Codable (backward-compat decode)
@@ -146,6 +187,11 @@ public struct Settings: Codable, Equatable, Sendable {
         case kvCacheHotMB
         case kvCacheColdGB
         case maxResidentMemoryGB
+        case audioEnabled
+        case sttModel
+        case ttsModel
+        case ttsVoice
+        case ttsAutoSpeak
     }
 
     public init(from decoder: Decoder) throws {
@@ -167,6 +213,14 @@ public struct Settings: Codable, Equatable, Sendable {
         self.maxResidentMemoryGB =
             (try c.decodeIfPresent(Int.self, forKey: .maxResidentMemoryGB))
             ?? max(4, Int(MemoryProbe.totalMemoryGB()) / 2)
+        // v0.6 audio fields — pre-v0.6 settings.json files don't carry
+        // them. Fall back to "audio off" so existing installs upgrade
+        // without surprise mic permission prompts.
+        self.audioEnabled = try c.decodeIfPresent(Bool.self, forKey: .audioEnabled) ?? false
+        self.sttModel = try c.decodeIfPresent(String.self, forKey: .sttModel)
+        self.ttsModel = try c.decodeIfPresent(String.self, forKey: .ttsModel)
+        self.ttsVoice = try c.decodeIfPresent(String.self, forKey: .ttsVoice)
+        self.ttsAutoSpeak = try c.decodeIfPresent(Bool.self, forKey: .ttsAutoSpeak) ?? false
     }
 }
 

diff --git a/MacMLXCore/Tests/MacMLXCoreTests/Managers/SettingsAudioTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/Managers/SettingsAudioTests.swift
@@ -0,0 +1,62 @@
+import Testing
+import Foundation
+@testable import MacMLXCore
+
+@Suite("Settings audio fields (v0.6)")
+struct SettingsAudioTests {
+    /// `Settings.default` must have audio disabled, no STT/TTS model or voice picked, and auto-speak off.
+    @Test
+    func defaultSettingsHaveAudioOffAndNoModelsPicked() {
+        let s = Settings.default
+        #expect(s.audioEnabled == false)
+        #expect(s.sttModel == nil)
+        #expect(s.ttsModel == nil)
+        #expect(s.ttsVoice == nil)
+        #expect(s.ttsAutoSpeak == false)
+    }
+
+    @Test
+    func roundTripsThroughJSON() throws {
+        var s = Settings.default
+        s.audioEnabled = true
+        s.sttModel = "whisper-medium"
+        s.ttsModel = "marvis"
+        s.ttsVoice = "voices/clone-kevin.wav"
+        s.ttsAutoSpeak = true
+        // Encode, then decode, with stock JSON coders; every non-default audio value set above must survive.
+        let data = try JSONEncoder().encode(s)
+        let back = try JSONDecoder().decode(Settings.self, from: data)
+        #expect(back.audioEnabled == true)
+        #expect(back.sttModel == "whisper-medium")
+        #expect(back.ttsModel == "marvis")
+        #expect(back.ttsVoice == "voices/clone-kevin.wav")
+        #expect(back.ttsAutoSpeak == true)
+    }
+
+    /// Pre-v0.6 settings.json files don't carry any of the audio
+    /// keys — decoding one must yield audio off, no models or voice
+    /// picked, and auto-speak off, so existing installs upgrade quietly.
+    @Test
+    func legacyJSONWithoutAudioKeysDecodesWithAudioOff() throws {
+        let legacy = """
+        {
+            "modelDirectory": "file:///tmp/models",
+            "preferredEngine": "mlx-swift-lm",
+            "serverPort": 8000,
+            "autoStartServer": false,
+            "lastLoadedModel": null,
+            "onboardingComplete": true,
+            "pythonPath": null,
+            "swiftLMPath": null,
+            "sparkleUpdateChannel": "release",
+            "logRetentionDays": 7
+        }
+        """
+        let decoded = try JSONDecoder().decode(Settings.self, from: Data(legacy.utf8))
+        #expect(decoded.audioEnabled == false)
+        #expect(decoded.sttModel == nil)
+        #expect(decoded.ttsModel == nil)
+        #expect(decoded.ttsVoice == nil)
+        #expect(decoded.ttsAutoSpeak == false)
+    }
+}