diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c88f1eb..da82702 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,13 +40,17 @@ env:
 jobs:
   spm:
     name: SPM Build & Test
-    runs-on: macos-15
+    runs-on: macos-26
     timeout-minutes: 30
     steps:
       - uses: actions/checkout@v5
 
-      - name: Pin Xcode 16.4
-        run: sudo xcode-select -s /Applications/Xcode_16.4.app
+      # Xcode 26.4.1 ships Swift 6.3, which (unlike Swift 6.1 in Xcode 16.4)
+      # tolerates mlx-swift-lm's `private let context = CIContext()` in
+      # MLXVLM/MediaProcessing.swift under strict concurrency. Pinning to
+      # the GA build (not the 26.5 beta) so toolchain churn stays low.
+      - name: Pin Xcode 26.4.1
+        run: sudo xcode-select -s /Applications/Xcode_26.4.1.app
 
       - name: Cache SPM checkouts + build
         uses: actions/cache@v5
@@ -81,13 +85,17 @@ jobs:
 
   app:
     name: Xcode App Build
-    runs-on: macos-15
+    runs-on: macos-26
     timeout-minutes: 30
     steps:
       - uses: actions/checkout@v5
 
-      - name: Pin Xcode 16.4
-        run: sudo xcode-select -s /Applications/Xcode_16.4.app
+      # Xcode 26.4.1 ships Swift 6.3, which (unlike Swift 6.1 in Xcode 16.4)
+      # tolerates mlx-swift-lm's `private let context = CIContext()` in
+      # MLXVLM/MediaProcessing.swift under strict concurrency. Pinning to
+      # the GA build (not the 26.5 beta) so toolchain churn stays low.
+      - name: Pin Xcode 26.4.1
+        run: sudo xcode-select -s /Applications/Xcode_26.4.1.app
 
       - name: Cache Xcode DerivedData
         uses: actions/cache@v5
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index b1311b3..899d045 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -16,7 +16,7 @@ on:
 jobs:
   build:
     name: Build, Sign & Release
-    runs-on: macos-15
+    runs-on: macos-26
     timeout-minutes: 60
 
     permissions:
@@ -29,8 +29,11 @@ jobs:
           fetch-depth: 0
           token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Pin Xcode 16.4
-        run: sudo xcode-select -s /Applications/Xcode_16.4.app
+      # Match ci.yml — Xcode 26.4.1 GA (Swift 6.3) tolerates
+      # mlx-swift-lm's `private let context = CIContext()` in MLXVLM;
+      # Xcode 16.4 / Swift 6.1 rejects it with a strict-concurrency error.
+      - name: Pin Xcode 26.4.1
+        run: sudo xcode-select -s /Applications/Xcode_26.4.1.app
 
       - name: Show toolchain
         run: |
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8c7746e..5b5ea90 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -59,6 +59,31 @@ Versioning follows [Semantic Versioning](https://semver.org/).
   `macmlx serve` and `macmlx run`). Drop into Claude Desktop's
   `claude_desktop_config.json` as
   `{ "mcpServers": { "macmlx": { "command": "macmlx", "args": ["mcp", "serve"] } } }`.
+- **VLM Engine** (v0.4.1, part 2 of 3). MLXSwiftEngine now branches
+  on `model.format` to load text-only models through
+  `MLXLLM.LLMModelFactory` and vision-language models through
+  `MLXVLM.VLMModelFactory`. Runtime modality stored in a new
+  `LoadedSupport` enum (`.none / .llm / .vlm`).
+  - `runGeneration(_:)` splits into `runLLMGeneration` (existing
+    prompt-cache flow — hot/cold KV tier, suffix prefill, save
+    extended cache after stream) and `runVLMGeneration` (fresh KV
+    cache per call; bypasses the prompt cache for now since
+    multimodal cache keys would need to fold image bytes into the
+    chained hash).
+  - `Chat.Message` mapping respects modality: VLM models receive
+    `ChatMessage.images` as `UserInput.Image.url(URL)` so the VLM's
+    `UserInputProcessor` can inject image tokens; LLM models drop
+    accidental attachments with a debug-level Pulse warning.
+  - `MLXVLM` added to `MacMLXCore` package dependencies (sibling
+    product of `MLXLLM` already in our `mlx-swift-lm` 3.31.x pin —
+    no new SPM dependency tree).
+  - Three new unit tests cover load-failure paths (gguf and
+    unknown format rejection, missing VLM directory). 111/111 Core
+    tests green. Real VLM smoke (loading e.g. SmolVLM-Instruct-4bit)
+    is a manual-QA item — multi-GB download.
+  - Image picker, multimodal HTTP, and conversation persistence
+    land in the v0.4.1 part-3 PR. Plan:
+    `docs/superpowers/plans/2026-05-10-v0.4.1-vlm.md`.
 - **VLM Foundation** (v0.4.1, part 1 of 3). Pure-Swift Core changes
   for vision-language model support. No MLX integration yet, no UI,
   no HTTP changes.
diff --git a/MacMLXCore/Package.swift b/MacMLXCore/Package.swift
index 3735d4a..1c543bc 100644
--- a/MacMLXCore/Package.swift
+++ b/MacMLXCore/Package.swift
@@ -20,6 +20,7 @@ let package = Package(
             name: "MacMLXCore",
             dependencies: [
                 .product(name: "MLXLLM", package: "mlx-swift-lm"),
+                .product(name: "MLXVLM", package: "mlx-swift-lm"),
                 .product(name: "MLXLMCommon", package: "mlx-swift-lm"),
                 .product(name: "Transformers", package: "swift-transformers"),
                 .product(name: "Pulse", package: "Pulse"),
diff --git a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift
index 6b0eeab..f26cb23 100644
--- a/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift
+++ b/MacMLXCore/Sources/MacMLXCore/Engine/MLXSwiftEngine.swift
@@ -2,6 +2,7 @@ import Foundation
 import MLX
 import MLXLLM
 import MLXLMCommon
+import MLXVLM
 @preconcurrency import Tokenizers
 
 // MARK: - Sendable-box helpers
@@ -99,11 +100,37 @@ public actor MLXSwiftEngine: InferenceEngine {
 
     // MARK: Private state
 
-    private var modelContainer: ModelContainer?
+    /// What's currently loaded — text-only LLM (`MLXLLM`), vision-
+    /// language VLM (`MLXVLM`), or nothing. Both modalities wrap a
+    /// `ModelContainer`; the case discriminates which factory built
+    /// it so generation can choose the right code path (LLM gets the
+    /// prompt cache; VLM bypasses it for now — multimodal cache keys
+    /// would need to fold image bytes into the hash, deferred to a
+    /// follow-up).
+    private enum LoadedSupport {
+        case none  // nothing loaded: initial state, after unload(), or after a failed load
+        case llm(ModelContainer)  // container built by LLMModelFactory
+        case vlm(ModelContainer)  // container built by VLMModelFactory
+
+        var container: ModelContainer? {  // payload of either loaded case; nil for .none
+            switch self {
+            case .none: return nil
+            case .llm(let c): return c
+            case .vlm(let c): return c
+            }
+        }
+
+        var isVLM: Bool {  // true only for .vlm
+            if case .vlm = self { return true }
+            return false
+        }
+    }
+
+    private var loadedSupport: LoadedSupport = .none
 
     /// Two-tier prompt cache (hot dict + cold safetensors sidecar). Used
     /// by `runGeneration` to reuse KV state across successive turns on
-    /// the same model. See `PromptCacheStore` for the tiering policy.
+    /// the same LLM. VLM generations bypass it.
     private let promptCacheStore: PromptCacheStore
 
     // MARK: Initialiser
@@ -135,23 +162,51 @@ public actor MLXSwiftEngine: InferenceEngine {
                 + "https://github.com/ml-explore/mlx-swift-lm/issues/219. "
                 + "Use a dense Gemma 4 checkpoint (E2B / E4B) in the meantime."
             status = .error(reason)
-            modelContainer = nil
+            loadedSupport = .none
             loadedModel = nil
             throw EngineError.modelLoadFailed(reason: reason)
         }
 
         do {
-            let container = try await LLMModelFactory.shared.loadContainer(
-                from: model.directory,
-                using: HuggingFaceTokenizerLoader()
-            )
-            modelContainer = container
+            let support: LoadedSupport
+            switch model.format {
+            case .mlx:
+                let container = try await LLMModelFactory.shared.loadContainer(
+                    from: model.directory,
+                    using: HuggingFaceTokenizerLoader()
+                )
+                support = .llm(container)
+
+            case .mlxVLM:
+                let container = try await VLMModelFactory.shared.loadContainer(
+                    from: model.directory,
+                    using: HuggingFaceTokenizerLoader()
+                )
+                support = .vlm(container)
+
+            case .gguf, .unknown:
+                // Surfaced via the Models tab — these formats never
+                // reach the engine in practice, but throw a clean
+                // error if someone hand-constructs a `LocalModel`.
+                let reason = "Unsupported model format: \(model.format.rawValue). " +
+                    "MLXSwiftEngine handles `mlx` (text) and `mlxVLM` (vision-language) only."
+                status = .error(reason)
+                loadedSupport = .none
+                loadedModel = nil
+                throw EngineError.modelLoadFailed(reason: reason)
+            }
+            loadedSupport = support
             loadedModel = model
             status = .ready(model: model.id)
+        } catch let engineError as EngineError {
+            // Already shaped — preserve the typed error.
+            loadedSupport = .none
+            loadedModel = nil
+            throw engineError
         } catch {
             let reason = error.localizedDescription
             status = .error(reason)
-            modelContainer = nil
+            loadedSupport = .none
             loadedModel = nil
             throw EngineError.modelLoadFailed(reason: reason)
         }
@@ -192,7 +247,7 @@ public actor MLXSwiftEngine: InferenceEngine {
 
     /// Release the loaded model from memory.
     public func unload() async throws {
-        modelContainer = nil
+        loadedSupport = .none
         loadedModel = nil
         status = .idle
     }
@@ -255,7 +310,8 @@ public actor MLXSwiftEngine: InferenceEngine {
         _ request: GenerateRequest,
         into continuation: AsyncThrowingStream<GenerateChunk, Error>.Continuation
     ) async throws {
-        guard let container = modelContainer else {
+        let support = loadedSupport
+        guard let container = support.container else {
             continuation.finish(throwing: EngineError.modelNotLoaded)
             return
         }
@@ -263,6 +319,7 @@ public actor MLXSwiftEngine: InferenceEngine {
             continuation.finish(throwing: EngineError.modelNotLoaded)
             return
         }
+        let isVLM = support.isVLM
 
         let params = request.parameters
 
@@ -275,6 +332,11 @@ public actor MLXSwiftEngine: InferenceEngine {
         )
 
         // Map our ChatMessage array to MLXLMCommon Chat.Message array.
+        // For VLM models, fold each message's `images` into the `Chat.Message`
+        // image bag — the VLM's `UserInputProcessor` injects image tokens at
+        // the right position when it builds the prompt. For LLM models we
+        // drop attachments with a debug-level warning so `[image attached]`
+        // stub strings don't sneak into the chat template.
         let chatMessages: [Chat.Message] = request.allMessages.map { msg in
             let role: Chat.Message.Role
             switch msg.role {
@@ -282,7 +344,20 @@ public actor MLXSwiftEngine: InferenceEngine {
             case .assistant: role = .assistant
             case .system:    role = .system
             }
-            return Chat.Message(role: role, content: msg.content)
+            if isVLM {
+                let images: [UserInput.Image] = msg.images.map { .url($0.fileURL) }
+                return Chat.Message(role: role, content: msg.content, images: images)
+            } else {
+                if !msg.images.isEmpty {
+                    Task.detached { [count = msg.images.count] in
+                        await LogManager.shared.debug(
+                            "Dropping \(count) image attachment(s) on text-only model — load a VLM (Qwen-VL, Gemma-3, SmolVLM, …) to use images.",
+                            category: .inference
+                        )
+                    }
+                }
+                return Chat.Message(role: role, content: msg.content)
+            }
         }
 
         let userInput = UserInput(chat: chatMessages)
@@ -308,10 +383,40 @@ public actor MLXSwiftEngine: InferenceEngine {
             throw EngineError.modelLoadFailed(reason: error.localizedDescription)
         }
 
+        if isVLM {
+            // VLM path: bypass the prompt cache (the cache key would
+            // need to fold image content hashes into the chained hash
+            // — deferred to a follow-up).
+            try await runVLMGeneration(
+                lmInput: lmInput,
+                container: container,
+                generateParams: generateParams,
+                into: continuation
+            )
+        } else {
+            try await runLLMGeneration(
+                lmInput: lmInput,
+                container: container,
+                generateParams: generateParams,
+                modelID: loadedModelSnapshot.id,
+                into: continuation
+            )
+        }
+    }
+
+    /// Text-only path: tokenise, look up the prompt cache, prefill only
+    /// the new suffix, and stream tokens. Saves the extended cache back
+    /// to `promptCacheStore` once the stream completes.
+    private func runLLMGeneration(
+        lmInput: LMInput,
+        container: ModelContainer,
+        generateParams: GenerateParameters,
+        modelID: String,
+        into continuation: AsyncThrowingStream<GenerateChunk, Error>.Continuation
+    ) async throws {
         // Flat Int token array for key construction. `LMInput.text.tokens`
         // is an `MLXArray`; `asArray(Int.self)` materialises to Swift.
         let inputTokens = lmInput.text.tokens.asArray(Int.self)
-        let modelID = loadedModelSnapshot.id
         let priorKey = PromptCacheKey(modelID: modelID, tokens: inputTokens)
 
         // Try the store. On hit we reuse the restored cache; on miss we
@@ -391,23 +496,75 @@ public actor MLXSwiftEngine: InferenceEngine {
             snapshot: PromptCacheSnapshot(workingCache)
         )
 
-        // Emit the final chunk with usage + finish reason.
-        if let info = completionInfo {
-            let finishReason: FinishReason
-            switch info.stopReason {
-            case .length:
-                finishReason = .length
-            case .stop, .cancelled:
-                finishReason = .stop
-            }
-            let usage = TokenUsage(
-                promptTokens: info.promptTokenCount,
-                completionTokens: info.generationTokenCount
+        emitFinalChunk(completionInfo: completionInfo, into: continuation)
+        continuation.finish()
+    }
+
+    /// Vision-language path: takes the `LMInput` the caller has already
+    /// prepared, allocates a fresh KV cache inside `container.perform`,
+    /// and streams detokenized text chunks into `continuation`. Bypasses
+    /// the prompt cache — multimodal cache keys are a follow-up.
+    private func runVLMGeneration(
+        lmInput: LMInput,
+        container: ModelContainer,
+        generateParams: GenerateParameters,
+        into continuation: AsyncThrowingStream<GenerateChunk, Error>.Continuation
+    ) async throws {
+        let tokenizer = await container.tokenizer
+        let inputBox = NonSendableBox(lmInput)
+
+        let stream: AsyncStream<TokenGeneration> = try await container.perform(nonSendable: inputBox) { context, inputBox in
+            let cache = context.model.newCache(parameters: generateParams)  // new cache on every call; nothing carried over between turns
+            return try MLXLMCommon.generateTokens(
+                input: inputBox.value,
+                cache: cache,
+                parameters: generateParams,
+                context: context
             )
-            let finalChunk = GenerateChunk(text: "", finishReason: finishReason, usage: usage)
-            continuation.yield(finalChunk)
         }
 
+        var detokenizer = NaiveStreamingDetokenizer(tokenizer: tokenizer)
+        var completionInfo: GenerateCompletionInfo?
+
+        for await event in stream {
+            switch event {
+            case .token(let token):
+                detokenizer.append(token: token)
+                if let piece = detokenizer.next() {
+                    let chunk = GenerateChunk(text: piece)
+                    if case .terminated = continuation.yield(chunk) {
+                        return  // stream already terminated — stop without emitting a final chunk
+                    }
+                }
+            case .info(let info):
+                completionInfo = info  // kept for the final usage chunk below
+            }
+        }
+
+        emitFinalChunk(completionInfo: completionInfo, into: continuation)
         continuation.finish()
     }
+
+    /// Shared "final chunk" emit (usage + finish reason), called by both
+    /// the LLM and VLM paths so the chunk shape stays identical. Yields
+    /// nothing when `completionInfo` is nil; never finishes the stream.
+    private func emitFinalChunk(
+        completionInfo: GenerateCompletionInfo?,
+        into continuation: AsyncThrowingStream<GenerateChunk, Error>.Continuation
+    ) {
+        guard let info = completionInfo else { return }
+        let finishReason: FinishReason
+        switch info.stopReason {
+        case .length:
+            finishReason = .length
+        case .stop, .cancelled:  // cancellation is surfaced as a plain stop
+            finishReason = .stop
+        }
+        let usage = TokenUsage(
+            promptTokens: info.promptTokenCount,
+            completionTokens: info.generationTokenCount
+        )
+        let finalChunk = GenerateChunk(text: "", finishReason: finishReason, usage: usage)  // empty text: this chunk carries only metadata
+        continuation.yield(finalChunk)
+    }
 }
diff --git a/MacMLXCore/Tests/MacMLXCoreTests/Engine/MLXSwiftEngineVLMTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/Engine/MLXSwiftEngineVLMTests.swift
new file mode 100644
index 0000000..f4b505c
--- /dev/null
+++ b/MacMLXCore/Tests/MacMLXCoreTests/Engine/MLXSwiftEngineVLMTests.swift
@@ -0,0 +1,79 @@
+import Testing
+import Foundation
+@testable import MacMLXCore
+
+/// Engine-level guards that don't require a real MLX runtime to verify.
+/// Loading a real VLM checkpoint needs Metal + a multi-GB download, so
+/// the happy-path smoke test stays a manual-QA / integration item; what
+/// we can assert here is that unsupported formats reject early and that
+/// the typed-error shape matches the existing LLM path.
+@Suite("MLXSwiftEngine VLM branch")
+struct MLXSwiftEngineVLMTests {
+
+    @Test
+    func loadFailsForGGUFFormat() async {
+        let engine = MLXSwiftEngine()
+        let model = LocalModel(
+            id: "fake-gguf",
+            displayName: "Fake GGUF",
+            directory: URL(fileURLWithPath: "/tmp/no-such-dir-gguf"),
+            sizeBytes: 0,
+            format: .gguf,
+            quantization: nil,
+            parameterCount: nil,
+            architecture: nil
+        )
+        await #expect(throws: EngineError.self) {
+            try await engine.load(model)
+        }
+        let status = await engine.status
+        if case .error(let reason) = status {
+            #expect(reason.contains("Unsupported model format"))
+            #expect(reason.contains("gguf"))
+        } else {
+            Issue.record("Expected .error status, got \(status)")
+        }
+    }
+
+    @Test
+    func loadFailsForUnknownFormat() async {
+        let engine = MLXSwiftEngine()
+        let model = LocalModel(
+            id: "fake-unknown",
+            displayName: "Fake Unknown",
+            directory: URL(fileURLWithPath: "/tmp/no-such-dir-unknown"),
+            sizeBytes: 0,
+            format: .unknown,
+            quantization: nil,
+            parameterCount: nil,
+            architecture: nil
+        )
+        await #expect(throws: EngineError.self) {
+            try await engine.load(model)
+        }
+    }
+
+    @Test
+    func loadVLMFromMissingDirectoryThrowsModelLoadFailed() async {
+        // A `.mlxVLM` model pointing at a missing directory must fail
+        // to load. We assert only that load() throws an EngineError
+        // and that no model stays loaded — not which factory ran or
+        // which EngineError case was thrown.
+        let engine = MLXSwiftEngine()
+        let model = LocalModel(
+            id: "fake-vlm",
+            displayName: "Fake VLM",
+            directory: URL(fileURLWithPath: "/tmp/no-such-vlm-\(UUID().uuidString)"),
+            sizeBytes: 0,
+            format: .mlxVLM,
+            quantization: nil,
+            parameterCount: nil,
+            architecture: nil
+        )
+        await #expect(throws: EngineError.self) {
+            try await engine.load(model)
+        }
+        let loaded = await engine.loadedModel
+        #expect(loaded == nil)
+    }
+}