From 16bfc171e94b610ad67ed27e8de01299ad6f537b Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sun, 10 May 2026 13:01:53 +0700 Subject: [PATCH 1/4] feat(vlm): ConversationStore image persistence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - StoredMessage.images: [ImageAttachment] mirrors ChatMessage.images added in PR #33. Custom decoder defaults to empty when key absent — pre-v0.4.1 conversation JSON loads unchanged. - save(_:) internalises image URLs: any attachment outside the conversation's own images dir gets copied to <store-dir>/<conversation-id>/images/<uuid>.<ext>, then the URL is rewritten to point there. Best-effort: copy failure logs to stderr and falls through with the original URL preserved. - delete(id:) tears down both the JSON sidecar and the per- conversation directory (recursive remove). Pre-v0.4.1 conversations with no per-dir no-op cleanly. - 4 new tests in ConversationStoreImagesTests (.serialized for tmpdir safety): external-image copy, idempotent internal-URL preservation, delete-tears-down-conv-dir, legacy-JSON-decode. 115/115 Core green. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Managers/ConversationStore.swift | 95 +++++++++++- .../ConversationStoreImagesTests.swift | 141 ++++++++++++++++++ 2 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 MacMLXCore/Tests/MacMLXCoreTests/Managers/ConversationStoreImagesTests.swift diff --git a/MacMLXCore/Sources/MacMLXCore/Managers/ConversationStore.swift b/MacMLXCore/Sources/MacMLXCore/Managers/ConversationStore.swift index cec5316..9bc64b6 100644 --- a/MacMLXCore/Sources/MacMLXCore/Managers/ConversationStore.swift +++ b/MacMLXCore/Sources/MacMLXCore/Managers/ConversationStore.swift @@ -26,19 +26,42 @@ public struct StoredMessage: Codable, Hashable, Identifiable, Sendable { public var content: String public let timestamp: Date public var tokenCount: Int? + /// Image attachments tied to this turn. Empty for text-only — the + /// common case.
URLs point into + /// `<store-dir>/<conversation-id>/images/...` once the conversation + /// has been saved (see `ConversationStore.save(_:)`). + /// Backwards-compatible: pre-v0.4.1 JSON without an `images` key + /// decodes with an empty array. + public var images: [ImageAttachment] public init( id: UUID = UUID(), role: MessageRole, content: String, timestamp: Date = Date(), - tokenCount: Int? = nil + tokenCount: Int? = nil, + images: [ImageAttachment] = [] ) { self.id = id self.role = role self.content = content self.timestamp = timestamp self.tokenCount = tokenCount + self.images = images + } + + private enum CodingKeys: String, CodingKey { + case id, role, content, timestamp, tokenCount, images + } + + public init(from decoder: Decoder) throws { + let c = try decoder.container(keyedBy: CodingKeys.self) + self.id = try c.decode(UUID.self, forKey: .id) + self.role = try c.decode(MessageRole.self, forKey: .role) + self.content = try c.decode(String.self, forKey: .content) + self.timestamp = try c.decode(Date.self, forKey: .timestamp) + self.tokenCount = try c.decodeIfPresent(Int.self, forKey: .tokenCount) + self.images = try c.decodeIfPresent([ImageAttachment].self, forKey: .images) ?? [] } } @@ -109,6 +132,16 @@ public actor ConversationStore { /// Persist `conversation` to disk atomically. Creates the directory if /// missing. Bumps `updatedAt` to "now" before writing. /// + /// Image attachments referenced by any message get copied (best- + /// effort) into `<store-dir>/<conversation-id>/images/<uuid>.<ext>` + /// the first time we see them, so the saved JSON URLs are stable + /// across user moves of the picked file. Already-internal URLs + /// (already pointing at the conversation's images dir) are left + /// in place. A copy failure logs to stderr and falls through — + /// the conversation still saves with the original URL, which the + /// reader will tolerate (image just won't load if the source + /// disappears).
+ /// /// Uses `JSONCoding.precisionEncoder` so rapid saves produce distinct /// `updatedAt` values for `list()` sort stability. Decoder accepts /// pre-v0.3 ISO-8601-string files for backward compat. @@ -120,11 +153,49 @@ public actor ConversationStore { if copy.title == "New Chat" { copy.title = copy.derivedTitle } + copy.messages = copy.messages.map { internaliseImages(of: $0, conversationID: copy.id) } + let data = try JSONCoding.precisionEncoder().encode(copy) let url = fileURL(for: copy.id) try data.write(to: url, options: .atomic) } + /// Copy any image attachments that live outside the conversation's + /// own `images/` directory into it, and rewrite the URLs. Idempotent: + /// images already inside the conversation directory are kept verbatim. + private func internaliseImages( + of message: StoredMessage, + conversationID: UUID + ) -> StoredMessage { + guard !message.images.isEmpty else { return message } + let imagesDir = imagesDirectory(for: conversationID) + + let updated: [ImageAttachment] = message.images.map { att in + // Already internal? — leave it alone. + if att.fileURL.path.hasPrefix(imagesDir.path) { + return att + } + // Try to copy. On any error, fall through to the original + // attachment so the save still succeeds. + do { + try fileManager.createDirectory(at: imagesDir, withIntermediateDirectories: true) + let ext = att.fileURL.pathExtension.isEmpty ? "img" : att.fileURL.pathExtension + let dest = imagesDir.appending( + path: "\(UUID().uuidString).\(ext)", directoryHint: .notDirectory) + try fileManager.copyItem(at: att.fileURL, to: dest) + return ImageAttachment(fileURL: dest, mimeType: att.mimeType) + } catch { + FileHandle.standardError.write(Data( + "[ConversationStore] image copy failed for \(att.fileURL.path): \(error)\n".utf8 + )) + return att + } + } + var out = message + out.images = updated + return out + } + /// Return the most-recently-updated conversation, or nil if the store /// is empty. 
Corrupt files are skipped (they don't block other loads). public func loadLatest() async throws -> Conversation? { @@ -170,10 +241,16 @@ public actor ConversationStore { return loaded.sorted { $0.updatedAt > $1.updatedAt } } - /// Remove a conversation from disk. Idempotent — no error if missing. + /// Remove a conversation from disk along with any internalised + /// image attachments. Idempotent — no error if missing. public func delete(id: UUID) async throws { let url = fileURL(for: id) try? fileManager.removeItem(at: url) + // Also remove the per-conversation directory if it exists + // (images live under it; pre-v0.4.1 conversations didn't + // create one and this no-ops cleanly). + let convDir = conversationDirectory(for: id) + try? fileManager.removeItem(at: convDir) } // MARK: - Private @@ -182,6 +259,20 @@ public actor ConversationStore { directory.appending(path: "\(id.uuidString).json", directoryHint: .notDirectory) } + /// Per-conversation directory holding sidecar resources (images, + /// future audio attachments). Created on demand in + /// `internaliseImages(of:conversationID:)` and torn down by + /// `delete(id:)`. + private func conversationDirectory(for id: UUID) -> URL { + directory.appending(path: id.uuidString, directoryHint: .isDirectory) + } + + /// Path used for image attachments of a given conversation. 
+ private func imagesDirectory(for id: UUID) -> URL { + conversationDirectory(for: id) + .appending(path: "images", directoryHint: .isDirectory) + } + private func ensureDirectory() throws { if !fileManager.fileExists(atPath: directory.path) { try fileManager.createDirectory(at: directory, withIntermediateDirectories: true) diff --git a/MacMLXCore/Tests/MacMLXCoreTests/Managers/ConversationStoreImagesTests.swift b/MacMLXCore/Tests/MacMLXCoreTests/Managers/ConversationStoreImagesTests.swift new file mode 100644 index 0000000..0df8ba6 --- /dev/null +++ b/MacMLXCore/Tests/MacMLXCoreTests/Managers/ConversationStoreImagesTests.swift @@ -0,0 +1,141 @@ +import Testing +import Foundation +@testable import MacMLXCore + +/// Filesystem-backed: serialised so swift-testing's parallel executor +/// doesn't thrash on the temp directory. +@Suite("ConversationStore — images persistence", .serialized) +struct ConversationStoreImagesTests { + + @Test + func saveCopiesExternalImageIntoConversationDir() async throws { + let temp = try TempDir() + let store = ConversationStore(directory: temp.url) + + // Lay down a "user-picked" image outside the store. + let pickedDir = temp.url.appendingPathComponent("picks", isDirectory: true) + try FileManager.default.createDirectory(at: pickedDir, withIntermediateDirectories: true) + let pickedURL = pickedDir.appendingPathComponent("cat.jpg", isDirectory: false) + try Data("fake-bytes".utf8).write(to: pickedURL) + + let conv = Conversation( + messages: [ + StoredMessage( + role: .user, + content: "Look at this", + images: [ImageAttachment(fileURL: pickedURL, mimeType: "image/jpeg")] + ) + ] + ) + + try await store.save(conv) + + // Reload; image URL must point inside the per-conversation dir. 
+ let listed = try await store.list() + #expect(listed.count == 1) + let reloaded = try #require(listed.first) + #expect(reloaded.messages.first?.images.count == 1) + let savedURL = try #require(reloaded.messages.first?.images.first?.fileURL) + let convImagesPrefix = temp.url + .appendingPathComponent(conv.id.uuidString, isDirectory: true) + .appendingPathComponent("images", isDirectory: true) + .path + #expect(savedURL.path.hasPrefix(convImagesPrefix), "saved URL must be inside conversation images dir; got \(savedURL.path)") + // Bytes survived the copy. + let bytes = try Data(contentsOf: savedURL) + #expect(bytes == Data("fake-bytes".utf8)) + } + + @Test + func saveLeavesAlreadyInternalImageURLAlone() async throws { + // If the URL already points into the conversation's images dir, + // the second save must not re-copy + rename it. + let temp = try TempDir() + let store = ConversationStore(directory: temp.url) + let convID = UUID() + let imagesDir = temp.url + .appendingPathComponent(convID.uuidString, isDirectory: true) + .appendingPathComponent("images", isDirectory: true) + try FileManager.default.createDirectory(at: imagesDir, withIntermediateDirectories: true) + let internalURL = imagesDir.appendingPathComponent("already-here.jpg", isDirectory: false) + try Data("internal".utf8).write(to: internalURL) + + let conv = Conversation( + id: convID, + messages: [ + StoredMessage( + role: .user, + content: "Look", + images: [ImageAttachment(fileURL: internalURL, mimeType: "image/jpeg")] + ) + ] + ) + try await store.save(conv) + + let listed = try await store.list() + let reloaded = try #require(listed.first) + let savedURL = try #require(reloaded.messages.first?.images.first?.fileURL) + #expect(savedURL == internalURL, "internal URL should be preserved verbatim") + } + + @Test + func deleteRemovesConversationImagesDir() async throws { + let temp = try TempDir() + let store = ConversationStore(directory: temp.url) + let pickedDir = 
temp.url.appendingPathComponent("picks", isDirectory: true) + try FileManager.default.createDirectory(at: pickedDir, withIntermediateDirectories: true) + let pickedURL = pickedDir.appendingPathComponent("cat.png", isDirectory: false) + try Data("png-bytes".utf8).write(to: pickedURL) + + let conv = Conversation( + messages: [ + StoredMessage( + role: .user, + content: "x", + images: [ImageAttachment(fileURL: pickedURL, mimeType: "image/png")] + ) + ] + ) + try await store.save(conv) + + // Confirm the conv directory exists. + let convDir = temp.url.appendingPathComponent(conv.id.uuidString, isDirectory: true) + #expect(FileManager.default.fileExists(atPath: convDir.path)) + + try await store.delete(id: conv.id) + + #expect(!FileManager.default.fileExists(atPath: convDir.path), "conversation dir must be removed") + // JSON sidecar gone too. + let jsonURL = temp.url.appendingPathComponent("\(conv.id.uuidString).json", isDirectory: false) + #expect(!FileManager.default.fileExists(atPath: jsonURL.path)) + } + + @Test + func legacyStoredMessageJSONWithoutImagesDecodesWithEmptyArray() throws { + let legacy = """ + { + "id": "1FAA0000-0000-0000-0000-000000000001", + "role": "user", + "content": "Hello", + "timestamp": 1700000000.0 + } + """ + let data = Data(legacy.utf8) + let decoder = JSONCoding.tolerantDecoder() + let decoded = try decoder.decode(StoredMessage.self, from: data) + #expect(decoded.images.isEmpty) + #expect(decoded.role == .user) + } +} + +/// Uniquely-named scratch dir for the filesystem-backed tests. Not removed automatically: the struct only creates the directory and has no cleanup hook.
+private struct TempDir { + let url: URL + + init() throws { + let base = FileManager.default.temporaryDirectory + .appendingPathComponent("macmlx-conv-image-tests-\(UUID().uuidString)", isDirectory: true) + try FileManager.default.createDirectory(at: base, withIntermediateDirectories: true) + self.url = base + } +} From 5c4d9c14cfaff2932d4dc7360f9929cd0169b5ec Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sun, 10 May 2026 13:05:42 +0700 Subject: [PATCH 2/4] feat(vlm): chat-input image picker + inline thumbnails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - UIChatMessage gains images: [ImageAttachment] mirroring StoredMessage + Core ChatMessage. Hydrated from stored messages on conversation reload; stripped to images: [] on send when the loaded model isn't a VLM. - ChatViewModel.attachedImages staging bag; canAttachImages / attachImage(at:) / removeAttachedImage(at:) / clearAttachedImages() helpers wired to the input view and to the model-modality gate (ChatViewModel.canAttachImages == coordinator.currentModel.format == .mlxVLM). send() picks the bag up + clears it. generate()'s ChatMessage map now passes images through to Core / engine. - ChatInputView gets a horizontal thumbnail strip above the text field, a paperclip button driving SwiftUI .fileImporter (image UTTypes only — png/jpeg/webp/gif/heic/bmp), and an enabled-state gate with explanatory tooltip when the loaded model is text-only. Send button now also enables when the user has staged images but no text (image-only ask is legitimate on a VLM). - ChatMessageView renders an inline LazyVGrid of 96pt thumbnails above the bubble for any message that has attachments. Click a thumbnail to open the file in Preview via NSWorkspace.shared.open. - AsyncThumbnailImage helper (NSImage-backed) lives next to ChatInputView and is reused by ChatMessageView. Local Xcode App Build green; 115/115 Core tests still green.
Co-Authored-By: Claude Opus 4.7 (1M context) --- macMLX/macMLX/Views/Chat/ChatInputView.swift | 136 +++++++++++++++--- .../macMLX/Views/Chat/ChatMessageView.swift | 31 ++++ macMLX/macMLX/Views/Chat/ChatView.swift | 2 + macMLX/macMLX/Views/Chat/ChatViewModel.swift | 55 ++++++- 4 files changed, 202 insertions(+), 22 deletions(-) diff --git a/macMLX/macMLX/Views/Chat/ChatInputView.swift b/macMLX/macMLX/Views/Chat/ChatInputView.swift index 40de2c2..d9b0db4 100644 --- a/macMLX/macMLX/Views/Chat/ChatInputView.swift +++ b/macMLX/macMLX/Views/Chat/ChatInputView.swift @@ -2,17 +2,109 @@ // macMLX import SwiftUI +import MacMLXCore +import UniformTypeIdentifiers struct ChatInputView: View { @Binding var text: String + /// VLM image attachments staged for the next user message. + @Binding var attachedImages: [ImageAttachment] let isGenerating: Bool let isModelLoaded: Bool + /// True when the loaded model accepts images (VLM). Drives the + /// paperclip button's enabled state. + let canAttachImages: Bool let onSend: () -> Void let onStop: () -> Void + @State private var isFileImporterPresented = false + var body: some View { + VStack(alignment: .leading, spacing: 6) { + if !attachedImages.isEmpty { + thumbnailStrip + } + inputRow + } + .padding(.horizontal, 12) + .padding(.vertical, 8) + .background( + RoundedRectangle(cornerRadius: 14) + .stroke(Color.secondary.opacity(0.3), lineWidth: 1) + .background(.background, in: RoundedRectangle(cornerRadius: 14)) + ) + .padding(.horizontal, 16) + .padding(.vertical, 8) + .fileImporter( + isPresented: $isFileImporterPresented, + allowedContentTypes: [.image, .png, .jpeg, .gif, .webP, .heic, .bmp], + allowsMultipleSelection: true + ) { result in + switch result { + case .success(let urls): + for url in urls { + if let mime = ImageAttachment.mimeType(forPathExtension: url.pathExtension) { + attachedImages.append(ImageAttachment(fileURL: url, mimeType: mime)) + } + } + case .failure: + // Silent — fileImporter surfaces its own error UI. 
+ break + } + } + } + + // MARK: - Subviews + + private var thumbnailStrip: some View { + ScrollView(.horizontal, showsIndicators: false) { + HStack(spacing: 8) { + ForEach(attachedImages, id: \.fileURL) { att in + ZStack(alignment: .topTrailing) { + AsyncThumbnailImage(url: att.fileURL) + .frame(width: 56, height: 56) + .clipShape(RoundedRectangle(cornerRadius: 8)) + .overlay( + RoundedRectangle(cornerRadius: 8) + .stroke(Color.secondary.opacity(0.3), lineWidth: 1) + ) + + Button { + attachedImages.removeAll { $0.fileURL == att.fileURL } + } label: { + Image(systemName: "xmark.circle.fill") + .font(.system(size: 16)) + .foregroundStyle(.white, .black.opacity(0.7)) + } + .buttonStyle(.plain) + .offset(x: 6, y: -6) + } + } + } + .padding(.vertical, 2) + } + } + + private var inputRow: some View { HStack(alignment: .bottom, spacing: 8) { + // Image picker button (paperclip). Disabled when the loaded + // model can't take images. Tooltip explains why. + Button { + isFileImporterPresented = true + } label: { + Image(systemName: "photo.on.rectangle") + .foregroundStyle(canAttachImages ? .secondary : Color.secondary.opacity(0.4)) + .frame(width: 28, height: 28) + } + .buttonStyle(.plain) + .disabled(!canAttachImages || isGenerating || !isModelLoaded) + .help( + canAttachImages + ? "Attach image (jpeg, png, webp, gif, heic, bmp)" + : "Load a vision-capable model (Qwen-VL, Gemma-3, SmolVLM, …) to attach images" + ) + // Auto-growing TextField keeps the cursor vertically centered // on a single line and expands to up to 5 lines. macOS 14+. TextField( @@ -25,10 +117,6 @@ struct ChatInputView: View { .font(.body) .disabled(isGenerating || !isModelLoaded) .onSubmit { - // Cmd+Return still sends via the Send button's keyboard - // shortcut. Plain Return inserts newline (default for - // axis:.vertical). Shift+Return is identical — TextField - // handles it. 
if canSend { onSend() } } @@ -57,21 +145,33 @@ struct ChatInputView: View { .keyboardShortcut(.return, modifiers: .command) } } - .padding(.horizontal, 12) - .padding(.vertical, 8) - .background( - RoundedRectangle(cornerRadius: 14) - .stroke(Color.secondary.opacity(0.3), lineWidth: 1) - .background(.background, in: RoundedRectangle(cornerRadius: 14)) - ) - .padding(.horizontal, 16) - .padding(.vertical, 8) } private var canSend: Bool { - !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty - && isModelLoaded - && !isGenerating + let hasContent = !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + || !attachedImages.isEmpty + return hasContent && isModelLoaded && !isGenerating + } +} + +/// Tiny disk-image thumbnail loader. Uses NSImage on the main actor — +/// images are small (≤120pt) so synchronous decode is fine. Gracefully +/// degrades to a placeholder glyph if the file can't be read. +struct AsyncThumbnailImage: View { + let url: URL + + var body: some View { + if let nsImage = NSImage(contentsOf: url) { + Image(nsImage: nsImage) + .resizable() + .aspectRatio(contentMode: .fill) + } else { + Image(systemName: "photo") + .imageScale(.large) + .foregroundStyle(.secondary) + .frame(maxWidth: .infinity, maxHeight: .infinity) + .background(Color.secondary.opacity(0.1)) + } } } @@ -79,15 +179,19 @@ struct ChatInputView: View { VStack { ChatInputView( text: .constant("Hello!"), + attachedImages: .constant([]), isGenerating: false, isModelLoaded: true, + canAttachImages: true, onSend: {}, onStop: {} ) ChatInputView( text: .constant(""), + attachedImages: .constant([]), isGenerating: true, isModelLoaded: true, + canAttachImages: false, onSend: {}, onStop: {} ) diff --git a/macMLX/macMLX/Views/Chat/ChatMessageView.swift b/macMLX/macMLX/Views/Chat/ChatMessageView.swift index f25cc74..2e86731 100644 --- a/macMLX/macMLX/Views/Chat/ChatMessageView.swift +++ b/macMLX/macMLX/Views/Chat/ChatMessageView.swift @@ -1,6 +1,7 @@ // ChatMessageView.swift // 
macMLX +import AppKit import SwiftUI import MacMLXCore @@ -87,6 +88,10 @@ struct ChatMessageView: View { @ViewBuilder private var bubble: some View { VStack(alignment: message.role == .user ? .trailing : .leading, spacing: 4) { + if !message.images.isEmpty { + imageStrip + } + renderedContent .textSelection(.enabled) .padding(.horizontal, 12) @@ -112,6 +117,32 @@ struct ChatMessageView: View { } } + /// Inline thumbnail strip rendered above the text bubble for any + /// message that has VLM image attachments (v0.4.1+). Click a + /// thumbnail to open the original in Preview via NSWorkspace. + @ViewBuilder + private var imageStrip: some View { + let columns = [GridItem(.adaptive(minimum: 96, maximum: 120), spacing: 8)] + LazyVGrid(columns: columns, alignment: .leading, spacing: 8) { + ForEach(message.images, id: \.fileURL) { att in + Button { + NSWorkspace.shared.open(att.fileURL) + } label: { + AsyncThumbnailImage(url: att.fileURL) + .frame(width: 96, height: 96) + .clipShape(RoundedRectangle(cornerRadius: 10)) + .overlay( + RoundedRectangle(cornerRadius: 10) + .stroke(Color.secondary.opacity(0.3), lineWidth: 1) + ) + } + .buttonStyle(.plain) + .help("Open \(att.fileURL.lastPathComponent) in Preview") + } + } + .frame(maxWidth: 320, alignment: message.role == .user ? 
.trailing : .leading) + } + private var bubbleBackground: Color { switch message.role { case .user: return .accentColor diff --git a/macMLX/macMLX/Views/Chat/ChatView.swift b/macMLX/macMLX/Views/Chat/ChatView.swift index 3f06439..379fccf 100644 --- a/macMLX/macMLX/Views/Chat/ChatView.swift +++ b/macMLX/macMLX/Views/Chat/ChatView.swift @@ -76,8 +76,10 @@ private struct ChatContent: View { // Input area ChatInputView( text: $viewModel.inputText, + attachedImages: $viewModel.attachedImages, isGenerating: viewModel.isGenerating, isModelLoaded: isModelLoaded, + canAttachImages: viewModel.canAttachImages, onSend: { Task { await viewModel.send() } }, diff --git a/macMLX/macMLX/Views/Chat/ChatViewModel.swift b/macMLX/macMLX/Views/Chat/ChatViewModel.swift index bb901da..8e4db77 100644 --- a/macMLX/macMLX/Views/Chat/ChatViewModel.swift +++ b/macMLX/macMLX/Views/Chat/ChatViewModel.swift @@ -20,6 +20,8 @@ struct UIChatMessage: Identifiable { let timestamp: Date var tokenCount: Int? var isGenerating: Bool + /// VLM image attachments tied to this message (v0.4.1+). + var images: [ImageAttachment] init( id: UUID = UUID(), @@ -27,7 +29,8 @@ struct UIChatMessage: Identifiable { content: String, timestamp: Date = Date(), tokenCount: Int? = nil, - isGenerating: Bool = false + isGenerating: Bool = false, + images: [ImageAttachment] = [] ) { self.id = id self.role = role @@ -35,6 +38,7 @@ struct UIChatMessage: Identifiable { self.timestamp = timestamp self.tokenCount = tokenCount self.isGenerating = isGenerating + self.images = images } /// Restore from persistence. @@ -45,6 +49,7 @@ struct UIChatMessage: Identifiable { self.timestamp = stored.timestamp self.tokenCount = stored.tokenCount self.isGenerating = false + self.images = stored.images } /// Dehydrate for persistence. Transient `isGenerating` is dropped. 
@@ -54,7 +59,8 @@ struct UIChatMessage: Identifiable { role: role, content: content, timestamp: timestamp, - tokenCount: tokenCount + tokenCount: tokenCount, + images: images ) } } @@ -68,6 +74,10 @@ final class ChatViewModel { var messages: [UIChatMessage] = [] var inputText: String = "" var isGenerating: Bool = false + /// Image attachments staged for the next user message (v0.4.1+). + /// `send()` picks these up, attaches them to the new turn, and + /// clears the bag. The chat input strip mirrors this array. + var attachedImages: [ImageAttachment] = [] /// All saved conversations, newest-first. Refreshed on `reloadConversationList()` /// — fires after every `persist()`, `switchTo(_:)`, `deleteConversation(_:)`, @@ -253,14 +263,47 @@ final class ChatViewModel { /// assistant's reply. func send() async { let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines) - guard !text.isEmpty, - coordinator.currentModel != nil else { return } + guard coordinator.currentModel != nil else { return } + // Allow image-only messages on VLM models — a bare image with no + // accompanying prompt is a legitimate ask ("describe this"). + guard !text.isEmpty || !attachedImages.isEmpty else { return } + let images = attachedImages + attachedImages = [] inputText = "" - messages.append(UIChatMessage(role: .user, content: text)) + messages.append(UIChatMessage(role: .user, content: text, images: images)) await generate() } + // MARK: - Image attachments (v0.4.1) + + /// Whether the currently-loaded model accepts image attachments. + /// Drives the chat input's image-picker enabled state. + var canAttachImages: Bool { + coordinator.currentModel?.format == .mlxVLM + } + + /// Attach an image picked from disk, paste, or drop. Determines the + /// MIME type from the path extension. Silently rejects URLs whose + /// extension we don't recognise. 
+ func attachImage(at url: URL) { + guard let mime = ImageAttachment.mimeType(forPathExtension: url.pathExtension) else { + return + } + attachedImages.append(ImageAttachment(fileURL: url, mimeType: mime)) + } + + /// Remove the staged attachment with the given file URL. No-op if + /// not present. + func removeAttachedImage(at url: URL) { + attachedImages.removeAll { $0.fileURL == url } + } + + /// Clear every staged attachment (e.g. on "New Chat"). + func clearAttachedImages() { + attachedImages = [] + } + // MARK: - Regenerate / Edit / Delete (#11) /// Re-run inference for the assistant message identified by @@ -345,7 +388,7 @@ final class ChatViewModel { let coreMessages: [ChatMessage] = messages .filter { !$0.isGenerating } - .map { ChatMessage(role: $0.role, content: $0.content) } + .map { ChatMessage(role: $0.role, content: $0.content, images: $0.images) } let params = parameters.parameters let request = GenerateRequest( From 9897a5455fd2342ce2972e0ac7ecc8cad3e005e5 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sun, 10 May 2026 13:07:34 +0700 Subject: [PATCH 3/4] feat(vlm): HummingbirdServer multimodal content array (OpenAI compat) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ChatCompletionRequest.Message.content now decodes either: - a plain string (every existing client) - or an OpenAI multimodal array of {type, text|image_url} parts Implementation lives in a new MultimodalContent enum that tries String first and falls through to [Part] — so legacy callers keep working unchanged. handleChatCompletions extracts text via .content.text (concatenated text parts), images via .content.extractImages(): - data:<mime>;base64,<payload> URLs decode to a tmpfile-backed ImageAttachment (jpeg/png/webp/gif/heic/bmp). Caps: 4 images per message, 10 MB per image. Oversized / unknown-MIME parts silently drop. - http(s):// and file:// are not fetched (defence-in-depth, even though the server is localhost-bound).
Decoded ImageAttachments flow through ChatMessage.images → engine (VLM model receives them; LLM model logs + drops, per PR #34). Ollama /api/chat / /api/generate stays text-only — Ollama uses a separate top-level field that's a follow-up. 115/115 Core tests still green; existing chatCompletionsNonStreaming proves the string-form fallback path still works. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../MacMLXCore/Server/HummingbirdServer.swift | 124 +++++++++++++++++- 1 file changed, 120 insertions(+), 4 deletions(-) diff --git a/MacMLXCore/Sources/MacMLXCore/Server/HummingbirdServer.swift b/MacMLXCore/Sources/MacMLXCore/Server/HummingbirdServer.swift index 3adc3a1..47c57fa 100644 --- a/MacMLXCore/Sources/MacMLXCore/Server/HummingbirdServer.swift +++ b/MacMLXCore/Sources/MacMLXCore/Server/HummingbirdServer.swift @@ -8,10 +8,15 @@ import ServiceLifecycle // MARK: - OpenAI-compatible request/response types /// OpenAI-compatible chat completion request body. +/// +/// `Message.content` accepts either a plain string (text-only chat) or +/// an OpenAI multimodal content array of `{type, text|image_url}` parts +/// (v0.4.1+ — VLM models can read images this way). The decoder tries +/// string first, falls back to `[Part]`. See `MultimodalContent` below. private struct ChatCompletionRequest: Decodable, Sendable { struct Message: Decodable, Sendable { let role: String - let content: String + let content: MultimodalContent } let model: String @@ -22,6 +27,111 @@ private struct ChatCompletionRequest: Decodable, Sendable { let max_tokens: Int? } +/// OpenAI multimodal content payload. Either a plain string (text-only +/// — backwards compat with every existing client) or an array of typed +/// parts (`text` and `image_url`). The decoder tries the string form +/// first; on failure it falls through to an array of parts so we don't +/// reject older clients that send a bare string. 
+private enum MultimodalContent: Decodable, Sendable { + case string(String) + case parts([Part]) + + struct Part: Decodable, Sendable { + let type: String // "text" or "image_url" + let text: String? + let image_url: ImageURL? + } + struct ImageURL: Decodable, Sendable { + /// Either a `data:image/...;base64,XXXX` URL (only form we + /// currently decode — see `extractImages()`) or `http(s)://`. + /// `file://` is rejected by `extractImages()` for defence-in- + /// depth even though the server is localhost-bound. + let url: String + } + + init(from decoder: Decoder) throws { + let container = try decoder.singleValueContainer() + if let s = try? container.decode(String.self) { + self = .string(s) + return + } + let parts = try container.decode([Part].self) + self = .parts(parts) + } + + /// Concatenated text view — what the model sees as the prompt + /// content for this turn. Image parts are ignored (their bytes + /// flow into the engine separately via `extractImages()`). + var text: String { + switch self { + case .string(let s): + return s + case .parts(let parts): + return parts.compactMap { $0.type == "text" ? $0.text : nil } + .joined(separator: "\n") + } + } + + /// Decode any base64 data URLs into `ImageAttachment` values backed + /// by tmpfile copies. Caps: + /// - 4 images per call (further parts silently dropped) + /// - 10 MB per image (oversized parts silently dropped) + /// - data URL only — `http(s)://` and `file://` are not fetched + func extractImages() -> [ImageAttachment] { + guard case .parts(let parts) = self else { return [] } + var out: [ImageAttachment] = [] + for part in parts { + guard part.type == "image_url", + let urlStr = part.image_url?.url, + let attachment = MultimodalContent.decodeDataURL(urlStr) + else { continue } + out.append(attachment) + if out.count >= 4 { break } + } + return out + } + + /// Best-effort base64 data-URL → on-disk image. 
Returns nil on + /// any malformed input or unsupported MIME so callers can simply + /// drop the part. + private static func decodeDataURL(_ urlStr: String) -> ImageAttachment? { + guard urlStr.hasPrefix("data:") else { return nil } + let body = urlStr.dropFirst("data:".count) + let split = body.split(separator: ",", maxSplits: 1, omittingEmptySubsequences: false) + guard split.count == 2 else { return nil } + let header = String(split[0]) // e.g. "image/png;base64" + let payload = String(split[1]) + guard header.hasSuffix(";base64") else { return nil } + let mime = String(header.dropLast(";base64".count)) + + let ext: String + switch mime.lowercased() { + case "image/jpeg", "image/jpg": ext = "jpg" + case "image/png": ext = "png" + case "image/webp": ext = "webp" + case "image/gif": ext = "gif" + case "image/heic": ext = "heic" + case "image/bmp": ext = "bmp" + default: return nil + } + + guard let bytes = Data(base64Encoded: payload, options: .ignoreUnknownCharacters) else { + return nil + } + // 10 MB per image cap. + if bytes.count > 10 * 1024 * 1024 { return nil } + + let tmp = FileManager.default.temporaryDirectory + .appendingPathComponent("macmlx-http-img-\(UUID().uuidString).\(ext)") + do { + try bytes.write(to: tmp) + return ImageAttachment(fileURL: tmp, mimeType: mime) + } catch { + return nil + } + } +} + /// Request body for `/x/models/load`. private struct LoadModelRequest: Decodable, Sendable { let model_path: String @@ -568,13 +678,19 @@ public actor HummingbirdServer { // property, so leaving it in both places produces a duplicate // system turn, which Qwen3 / Gemma / other strict Jinja chat // templates reject with a TemplateException. - let systemPrompt = chatReq.messages.first(where: { $0.role == "system" })?.content + let systemPrompt = chatReq.messages.first(where: { $0.role == "system" })?.content.text // Map the rest (user / assistant), dropping unknown roles and - // the now-separated system turns. + // the now-separated system turns. 
Multimodal `content` arrays + // are split here: text parts → `content`, image_url data URLs + // → `images` via `extractImages()`. See MultimodalContent. let messages: [ChatMessage] = chatReq.messages.compactMap { msg in guard let role = MessageRole(rawValue: msg.role), role != .system else { return nil } - return ChatMessage(role: role, content: msg.content) + return ChatMessage( + role: role, + content: msg.content.text, + images: msg.content.extractImages() + ) } let params = GenerationParameters( From dfdadcfc8a9b8b1197bb97f0df218eff9860bf03 Mon Sep 17 00:00:00 2001 From: Kefeng Zhou Date: Sun, 10 May 2026 13:08:04 +0700 Subject: [PATCH 4/4] docs: v0.4.1 part-3 changelog entry (UI + persistence + HTTP) Closes the v0.4.1 vision-language model rollout: PR #33 Foundation + PR #34 Engine + this PR's UI / persistence / HTTP triple. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b5ea90..fced16a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,44 @@ Versioning follows [Semantic Versioning](https://semver.org/). `macmlx serve` and `macmlx run`). Drop into Claude Desktop's `claude_desktop_config.json` as `{ "mcpServers": { "macmlx": { "command": "macmlx", "args": ["mcp", "serve"] } } }`. +- **VLM UI + Persistence + HTTP** (v0.4.1, part 3 of 3). Lights up + the user-facing surfaces for vision-language models. Closes the + v0.4.1 work begun in PRs #33 (Foundation) and #34 (Engine). + - **Chat input image picker.** New paperclip button in the chat + input opens SwiftUI's `.fileImporter` (image UTTypes only: + jpeg / png / webp / gif / heic / bmp), populating a horizontal + thumbnail strip above the text field. Click the × on a thumbnail + to drop it. The button is disabled when the loaded model isn't + a VLM, with an explanatory tooltip ("Load a vision-capable model + (Qwen-VL, Gemma-3, SmolVLM, …) to attach images"). 
Image-only + messages (no text) are now valid sends on a VLM. + - **Inline thumbnails on chat bubbles.** `ChatMessageView` renders + a 96pt LazyVGrid of attached images above the text bubble for + any turn that has images. Click a thumbnail to open the file in + Preview via `NSWorkspace`. + - **Conversation persistence.** `StoredMessage.images` round-trips + through `ConversationStore`. On save, every external image URL + is copied into `<conversations-dir>/<conversation-id>/images/` and the + stored URL is rewritten to point there — chats survive the user + moving the picked file. On `delete(id:)`, the per-conversation + directory is torn down so images don't leak. Pre-v0.4.1 + conversations decode unchanged (missing key → empty array). + - **OpenAI multimodal HTTP.** `/v1/chat/completions` now accepts + OpenAI's `content` array shape: + ```json + {"role":"user","content":[ + {"type":"text","text":"What's this?"}, + {"type":"image_url","image_url":{"url":"data:image/png;base64,…"}} + ]} + ``` + Plain-string `content` continues to work — the decoder tries + string first, falls through to `[Part]`. Base64 data URLs + decode to tmpfile-backed `ImageAttachment` values; caps: 10 MB + per image, 4 images per message; `http(s)://` and `file://` + URLs are not fetched (defence-in-depth on a localhost-bound + server). Ollama's `/api/chat` / `/api/generate` stay text-only + — Ollama's wire format uses a separate top-level + `images: [base64]` field; revisit in a follow-up. - **VLM Engine** (v0.4.1, part 2 of 3). MLXSwiftEngine now branches on `model.format` to load text-only models through `MLXLLM.LLMModelFactory` and vision-language models through