diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c88f1eb..da82702 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,13 +40,17 @@ env: jobs: spm: name: SPM Build & Test - runs-on: macos-15 + runs-on: macos-26 timeout-minutes: 30 steps: - uses: actions/checkout@v5 - - name: Pin Xcode 16.4 - run: sudo xcode-select -s /Applications/Xcode_16.4.app + # Xcode 26.4.1 ships Swift 6.3, which (unlike 6.0 in Xcode 16.4) + # tolerates mlx-swift-lm's `private let context = CIContext()` in + # MLXVLM/MediaProcessing.swift under strict concurrency. Pinning to + # the GA build (not the 26.5 beta) so toolchain churn stays low. + - name: Pin Xcode 26.4.1 + run: sudo xcode-select -s /Applications/Xcode_26.4.1.app - name: Cache SPM checkouts + build uses: actions/cache@v5 @@ -81,13 +85,17 @@ jobs: app: name: Xcode App Build - runs-on: macos-15 + runs-on: macos-26 timeout-minutes: 30 steps: - uses: actions/checkout@v5 - - name: Pin Xcode 16.4 - run: sudo xcode-select -s /Applications/Xcode_16.4.app + # Xcode 26.4.1 ships Swift 6.3, which (unlike 6.0 in Xcode 16.4) + # tolerates mlx-swift-lm's `private let context = CIContext()` in + # MLXVLM/MediaProcessing.swift under strict concurrency. Pinning to + # the GA build (not the 26.5 beta) so toolchain churn stays low. 
+ - name: Pin Xcode 26.4.1 + run: sudo xcode-select -s /Applications/Xcode_26.4.1.app - name: Cache Xcode DerivedData uses: actions/cache@v5 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b1311b3..899d045 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,7 +16,7 @@ on: jobs: build: name: Build, Sign & Release - runs-on: macos-15 + runs-on: macos-26 timeout-minutes: 60 permissions: @@ -29,8 +29,11 @@ jobs: fetch-depth: 0 token: ${{ secrets.GITHUB_TOKEN }} - - name: Pin Xcode 16.4 - run: sudo xcode-select -s /Applications/Xcode_16.4.app + # Match ci.yml — Xcode 26.4.1 GA (Swift 6.3) tolerates the + # CIContext Sendable annotation in mlx-swift-lm's MLXVLM + # (Xcode 16.4 / Swift 6.0 fails with strict-concurrency error). + - name: Pin Xcode 26.4.1 + run: sudo xcode-select -s /Applications/Xcode_26.4.1.app - name: Show toolchain run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c7746e..5b5ea90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,31 @@ Versioning follows [Semantic Versioning](https://semver.org/). `macmlx serve` and `macmlx run`). Drop into Claude Desktop's `claude_desktop_config.json` as `{ "mcpServers": { "macmlx": { "command": "macmlx", "args": ["mcp", "serve"] } } }`. +- **VLM Engine** (v0.4.1, part 2 of 3). MLXSwiftEngine now branches + on `model.format` to load text-only models through + `MLXLLM.LLMModelFactory` and vision-language models through + `MLXVLM.VLMModelFactory`. Runtime modality stored in a new + `LoadedSupport` enum (`.none / .llm / .vlm`). + - `runGeneration(_:)` splits into `runLLMGeneration` (existing + prompt-cache flow — hot/cold KV tier, suffix prefill, save + extended cache after stream) and `runVLMGeneration` (fresh KV + cache per call; bypasses the prompt cache for now since + multimodal cache keys would need to fold image bytes into the + chained hash). 
+ - `Chat.Message` mapping respects modality: VLM models receive + `ChatMessage.images` as `UserInput.Image.url(URL)` so the VLM's + `UserInputProcessor` can inject image tokens; LLM models drop + accidental attachments with a debug-level Pulse warning. + - `MLXVLM` added to `MacMLXCore` package dependencies (sibling + product of `MLXLLM` already in our `mlx-swift-lm` 3.31.x pin — + no new SPM dependency tree). + - Three new unit tests cover load-failure paths: unsupported-format + rejection (gguf, unknown) and a missing VLM directory. 111/111 Core tests + green. Real VLM smoke (loading e.g. SmolVLM-Instruct-4bit) is + a manual-QA item — multi-GB download. + - Image picker, multimodal HTTP, and conversation persistence + land in the v0.4.1 part-3 PR. Plan: + `docs/superpowers/plans/2026-05-10-v0.4.1-vlm.md`. - **VLM Foundation** (v0.4.1, part 1 of 3). Pure-Swift Core changes for vision-language model support. No MLX integration yet, no UI, no HTTP changes. diff --git a/MacMLXCore/Package.swift b/MacMLXCore/Package.swift index 3735d4a..1c543bc 100644 --- a/MacMLXCore/Package.swift +++ b/MacMLXCore/Package.swift @@ -20,6 +20,7 @@ let package = Package( name: "MacMLXCore", dependencies: [ .product(name: "MLXLLM", package: "mlx-swift-lm"), + .product(name: "MLXVLM", package: "mlx-swift-lm"), .product(name: "MLXLMCommon", package: "mlx-swift-lm"), .product(name: "Transformers", package: "swift-transformers"), .product(name: "Pulse", package: "Pulse"), diff --git a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift index 6b0eeab..f26cb23 100644 --- a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift +++ b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift @@ -2,6 +2,7 @@ import Foundation import MLX import MLXLLM import MLXLMCommon +import MLXVLM @preconcurrency import Tokenizers // MARK: - Sendable-box helpers @@ -99,11 +100,37 @@ public actor MLXSwiftEngine: InferenceEngine { // MARK: Private state - private var 
modelContainer: ModelContainer? + /// What's currently loaded — text-only LLM (`MLXLLM`), vision- + /// language VLM (`MLXVLM`), or nothing. Both modalities wrap a + /// `ModelContainer`; the case discriminates which factory built + /// it so generation can choose the right code path (LLM gets the + /// prompt cache; VLM bypasses it for now — multimodal cache keys + /// would need to fold image bytes into the hash, deferred to a + /// follow-up). + private enum LoadedSupport { + case none + case llm(ModelContainer) + case vlm(ModelContainer) + + var container: ModelContainer? { + switch self { + case .none: return nil + case .llm(let c): return c + case .vlm(let c): return c + } + } + + var isVLM: Bool { + if case .vlm = self { return true } + return false + } + } + + private var loadedSupport: LoadedSupport = .none /// Two-tier prompt cache (hot dict + cold safetensors sidecar). Used /// by `runGeneration` to reuse KV state across successive turns on - /// the same model. See `PromptCacheStore` for the tiering policy. + /// the same LLM. VLM generations bypass it. private let promptCacheStore: PromptCacheStore // MARK: Initialiser @@ -135,23 +162,51 @@ public actor MLXSwiftEngine: InferenceEngine { + "https://github.com/ml-explore/mlx-swift-lm/issues/219. " + "Use a dense Gemma 4 checkpoint (E2B / E4B) in the meantime." 
status = .error(reason) - modelContainer = nil + loadedSupport = .none loadedModel = nil throw EngineError.modelLoadFailed(reason: reason) } do { - let container = try await LLMModelFactory.shared.loadContainer( - from: model.directory, - using: HuggingFaceTokenizerLoader() - ) - modelContainer = container + let support: LoadedSupport + switch model.format { + case .mlx: + let container = try await LLMModelFactory.shared.loadContainer( + from: model.directory, + using: HuggingFaceTokenizerLoader() + ) + support = .llm(container) + + case .mlxVLM: + let container = try await VLMModelFactory.shared.loadContainer( + from: model.directory, + using: HuggingFaceTokenizerLoader() + ) + support = .vlm(container) + + case .gguf, .unknown: + // Surfaced via the Models tab — these formats never + // reach the engine in practice, but throw a clean + // error if someone hand-constructs a `LocalModel`. + let reason = "Unsupported model format: \(model.format.rawValue). " + + "MLXSwiftEngine handles `mlx` (text) and `mlxVLM` (vision-language) only." + status = .error(reason) + loadedSupport = .none + loadedModel = nil + throw EngineError.modelLoadFailed(reason: reason) + } + loadedSupport = support loadedModel = model status = .ready(model: model.id) + } catch let engineError as EngineError { + // Already shaped — preserve the typed error. + loadedSupport = .none + loadedModel = nil + throw engineError } catch { let reason = error.localizedDescription status = .error(reason) - modelContainer = nil + loadedSupport = .none loadedModel = nil throw EngineError.modelLoadFailed(reason: reason) } @@ -192,7 +247,7 @@ public actor MLXSwiftEngine: InferenceEngine { /// Release the loaded model from memory. 
public func unload() async throws { - modelContainer = nil + loadedSupport = .none loadedModel = nil status = .idle } @@ -255,7 +310,8 @@ public actor MLXSwiftEngine: InferenceEngine { _ request: GenerateRequest, into continuation: AsyncThrowingStream.Continuation ) async throws { - guard let container = modelContainer else { + let support = loadedSupport + guard let container = support.container else { continuation.finish(throwing: EngineError.modelNotLoaded) return } @@ -263,6 +319,7 @@ public actor MLXSwiftEngine: InferenceEngine { continuation.finish(throwing: EngineError.modelNotLoaded) return } + let isVLM = support.isVLM let params = request.parameters @@ -275,6 +332,11 @@ public actor MLXSwiftEngine: InferenceEngine { ) // Map our ChatMessage array to MLXLMCommon Chat.Message array. + // For VLM models, fold each message's `images` into the `Chat.Message` + // image bag — the VLM's `UserInputProcessor` injects image tokens at + // the right position when it builds the prompt. For LLM models we + // drop attachments with a debug-level warning so `[image attached]` + // stub strings don't sneak into the chat template. 
let chatMessages: [Chat.Message] = request.allMessages.map { msg in let role: Chat.Message.Role switch msg.role { @@ -282,7 +344,20 @@ public actor MLXSwiftEngine: InferenceEngine { case .assistant: role = .assistant case .system: role = .system } - return Chat.Message(role: role, content: msg.content) + if isVLM { + let images: [UserInput.Image] = msg.images.map { .url($0.fileURL) } + return Chat.Message(role: role, content: msg.content, images: images) + } else { + if !msg.images.isEmpty { + Task.detached { [count = msg.images.count] in + await LogManager.shared.debug( + "Dropping \(count) image attachment(s) on text-only model — load a VLM (Qwen-VL, Gemma-3, SmolVLM, …) to use images.", + category: .inference + ) + } + } + return Chat.Message(role: role, content: msg.content) + } } let userInput = UserInput(chat: chatMessages) @@ -308,10 +383,40 @@ public actor MLXSwiftEngine: InferenceEngine { throw EngineError.modelLoadFailed(reason: error.localizedDescription) } + if isVLM { + // VLM path: bypass the prompt cache (the cache key would + // need to fold image content hashes into the chained hash + // — deferred to a follow-up). + try await runVLMGeneration( + lmInput: lmInput, + container: container, + generateParams: generateParams, + into: continuation + ) + } else { + try await runLLMGeneration( + lmInput: lmInput, + container: container, + generateParams: generateParams, + modelID: loadedModelSnapshot.id, + into: continuation + ) + } + } + + /// Text-only path: tokenise, look up the prompt cache, prefill only + /// the new suffix, and stream tokens. Saves the extended cache back + /// to `promptCacheStore` once the stream completes. + private func runLLMGeneration( + lmInput: LMInput, + container: ModelContainer, + generateParams: GenerateParameters, + modelID: String, + into continuation: AsyncThrowingStream.Continuation + ) async throws { // Flat Int token array for key construction. 
`LMInput.text.tokens` // is an `MLXArray`; `asArray(Int.self)` materialises to Swift. let inputTokens = lmInput.text.tokens.asArray(Int.self) - let modelID = loadedModelSnapshot.id let priorKey = PromptCacheKey(modelID: modelID, tokens: inputTokens) // Try the store. On hit we reuse the restored cache; on miss we @@ -391,23 +496,75 @@ public actor MLXSwiftEngine: InferenceEngine { snapshot: PromptCacheSnapshot(workingCache) ) - // Emit the final chunk with usage + finish reason. - if let info = completionInfo { - let finishReason: FinishReason - switch info.stopReason { - case .length: - finishReason = .length - case .stop, .cancelled: - finishReason = .stop - } - let usage = TokenUsage( - promptTokens: info.promptTokenCount, - completionTokens: info.generationTokenCount + emitFinalChunk(completionInfo: completionInfo, into: continuation) + continuation.finish() + } + + /// Vision-language path: take the already-prepared multimodal input + /// (built upstream by the VLM's `UserInputProcessor`), allocate a fresh + /// KV cache, and stream tokens. Bypasses the prompt cache — multimodal + /// cache keys are a follow-up. + private func runVLMGeneration( + lmInput: LMInput, + container: ModelContainer, + generateParams: GenerateParameters, + into continuation: AsyncThrowingStream.Continuation + ) async throws { + let tokenizer = await container.tokenizer + let inputBox = NonSendableBox(lmInput) + + let stream: AsyncStream = try await container.perform(nonSendable: inputBox) { context, inputBox in + let cache = context.model.newCache(parameters: generateParams) + return try MLXLMCommon.generateTokens( + input: inputBox.value, + cache: cache, + parameters: generateParams, + context: context ) - let finalChunk = GenerateChunk(text: "", finishReason: finishReason, usage: usage) - continuation.yield(finalChunk) } + var detokenizer = NaiveStreamingDetokenizer(tokenizer: tokenizer) + var completionInfo: GenerateCompletionInfo? 
+ + for await event in stream { + switch event { + case .token(let token): + detokenizer.append(token: token) + if let piece = detokenizer.next() { + let chunk = GenerateChunk(text: piece) + if case .terminated = continuation.yield(chunk) { + return + } + } + case .info(let info): + completionInfo = info + } + } + + emitFinalChunk(completionInfo: completionInfo, into: continuation) continuation.finish() } + + /// Shared "final chunk" emit (usage + finish reason). Both LLM and + /// VLM paths funnel through this so the wire-format chunk shape + /// stays identical. + private func emitFinalChunk( + completionInfo: GenerateCompletionInfo?, + into continuation: AsyncThrowingStream.Continuation + ) { + guard let info = completionInfo else { return } + let finishReason: FinishReason + switch info.stopReason { + case .length: + finishReason = .length + case .stop, .cancelled: + finishReason = .stop + } + let usage = TokenUsage( + promptTokens: info.promptTokenCount, + completionTokens: info.generationTokenCount + ) + let finalChunk = GenerateChunk(text: "", finishReason: finishReason, usage: usage) + continuation.yield(finalChunk) + } } diff --git a/MacMLXCore/Tests/MacMLXCoreTests/Engine/MLXSwiftEngineVLMTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/Engine/MLXSwiftEngineVLMTests.swift new file mode 100644 index 0000000..f4b505c --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/Engine/MLXSwiftEngineVLMTests.swift @@ -0,0 +1,79 @@ +import Testing +import Foundation +@testable import MacMLXCore + +/// Engine-level guards that don't require a real MLX runtime to verify. +/// Loading a real VLM checkpoint needs Metal + a multi-GB download, so +/// the happy-path smoke test stays a manual-QA / integration item; what +/// we can assert here is that unsupported formats reject early and that +/// the typed-error shape matches the existing LLM path. 
+@Suite("MLXSwiftEngine VLM branch") +struct MLXSwiftEngineVLMTests { + + @Test + func loadFailsForGGUFFormat() async { + let engine = MLXSwiftEngine() + let model = LocalModel( + id: "fake-gguf", + displayName: "Fake GGUF", + directory: URL(fileURLWithPath: "/tmp/no-such-dir-gguf"), + sizeBytes: 0, + format: .gguf, + quantization: nil, + parameterCount: nil, + architecture: nil + ) + await #expect(throws: EngineError.self) { + try await engine.load(model) + } + let status = await engine.status + if case .error(let reason) = status { + #expect(reason.contains("Unsupported model format")) + #expect(reason.contains("gguf")) + } else { + Issue.record("Expected .error status, got \(status)") + } + } + + @Test + func loadFailsForUnknownFormat() async { + let engine = MLXSwiftEngine() + let model = LocalModel( + id: "fake-unknown", + displayName: "Fake Unknown", + directory: URL(fileURLWithPath: "/tmp/no-such-dir-unknown"), + sizeBytes: 0, + format: .unknown, + quantization: nil, + parameterCount: nil, + architecture: nil + ) + await #expect(throws: EngineError.self) { + try await engine.load(model) + } + } + + @Test + func loadVLMFromMissingDirectoryThrowsModelLoadFailed() async { + // A missing directory is expected to fail inside VLMModelFactory the + // same way it does for LLMModelFactory. This test asserts only that + // load() throws a typed `EngineError` and leaves `loadedModel` nil; + // it does not check the factory used or the specific error case. + let engine = MLXSwiftEngine() + let model = LocalModel( + id: "fake-vlm", + displayName: "Fake VLM", + directory: URL(fileURLWithPath: "/tmp/no-such-vlm-\(UUID().uuidString)"), + sizeBytes: 0, + format: .mlxVLM, + quantization: nil, + parameterCount: nil, + architecture: nil + ) + await #expect(throws: EngineError.self) { + try await engine.load(model) + } + let loaded = await engine.loadedModel + #expect(loaded == nil) + } +}