Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,27 @@ Versioning follows [Semantic Versioning](https://semver.org/).
`macmlx serve` and `macmlx run`). Drop into Claude Desktop's
`claude_desktop_config.json` as
`{ "mcpServers": { "macmlx": { "command": "macmlx", "args": ["mcp", "serve"] } } }`.
- **VLM Foundation** (v0.4.1, part 1 of 3). Pure-Swift Core changes
for vision-language model support. No MLX integration yet, no UI,
no HTTP changes.
- `ImageAttachment` value type (`fileURL`, `mimeType`) sits next
to `LocalModel` / `HFModel` and round-trips through Codable.
MIME-type helper covers jpeg / png / webp / gif / heic / bmp.
- `ChatMessage` gains an `images: [ImageAttachment]` field with a
custom `init(from:)` that defaults to empty when the key is
absent — pre-v0.4.1 conversation JSON loads unchanged, no
migration step.
- `ModelFormat.mlxVLM` distinguishes vision-language directories.
`ModelLibraryManager.scan(_:)` peeks `config.json`'s
`model_type` and tags 14 known VLM families: qwen2_vl,
qwen2_5_vl, qwen3_vl, qwen3_5_vl, gemma3, smolvlm, smolvlm2,
paligemma, pixtral, idefics3, fast_vlm, lfm2_vl, glm_ocr,
mistral3. Malformed / missing `model_type` falls back to
`.mlx`. 13 new unit tests cover detection edge cases.
- Engine integration (MLXSwiftEngine VLM branch via
`MLXVLM.VLMModelFactory`) and UI / HTTP work land in follow-up
PRs — see
`docs/superpowers/plans/2026-05-10-v0.4.1-vlm.md`.

---

Expand Down
26 changes: 25 additions & 1 deletion MacMLXCore/Sources/MacMLXCore/Engine/GenerateRequest.swift
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,35 @@ public struct ChatMessage: Codable, Hashable, Identifiable, Sendable {
public let id: UUID
public let role: MessageRole
public let content: String
/// Image attachments. Empty for text-only messages — the common
/// case. Backwards compatible: pre-v0.4.1 conversation JSON (which
/// has no `images` key) decodes with an empty array, so existing
/// user chats survive the upgrade unchanged.
public let images: [ImageAttachment]

public init(id: UUID = UUID(), role: MessageRole, content: String) {
/// Creates a chat message.
///
/// - Parameters:
///   - id: Identity of the message; a fresh `UUID` is generated when omitted.
///   - role: The role this message is attributed to.
///   - content: Text body of the message.
///   - images: Image attachments; defaults to none, i.e. a text-only message.
public init(
    id: UUID = UUID(),
    role: MessageRole,
    content: String,
    images: [ImageAttachment] = []
) {
    self.images = images
    self.content = content
    self.role = role
    self.id = id
}

/// Coding keys for `ChatMessage`. Declared explicitly so the custom
/// `init(from:)` can look up `images` and tolerate its absence.
private enum CodingKeys: String, CodingKey {
    case id
    case role
    case content
    case images
}

/// Decodes a message, accepting payloads that have no `images` key.
///
/// - Parameter decoder: The decoder to read the keyed container from.
/// - Throws: Any decoding error raised for `id`, `role`, `content`, or
///   for an `images` value that is present but cannot be decoded.
public init(from decoder: Decoder) throws {
    let container = try decoder.container(keyedBy: CodingKeys.self)

    id = try container.decode(UUID.self, forKey: .id)
    role = try container.decode(MessageRole.self, forKey: .role)
    content = try container.decode(String.self, forKey: .content)

    // Legacy conversations carry no `images` key; treat that as "no attachments".
    let storedImages = try container.decodeIfPresent([ImageAttachment].self, forKey: .images)
    images = storedImages ?? []
}
}

Expand Down
65 changes: 62 additions & 3 deletions MacMLXCore/Sources/MacMLXCore/Managers/ModelLibraryManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,27 @@ public actor ModelLibraryManager {

switch format {
case .mlx:
// Peek `config.json` `model_type` — upgrade to .mlxVLM
// when the directory contains a vision-language model.
let upgradedFormat = upgradeFormatIfVLM(directory: itemURL)
let model = buildLocalModel(
dirName: dirName,
dirURL: itemURL,
fileURLs: fileURLs
fileURLs: fileURLs,
format: upgradedFormat
)
results.append(model)

case .mlxVLM:
// `ModelFormat.detect(in:)` never returns this directly
// — it's set by `upgradeFormatIfVLM` above. Reachable
// only via tests that hand-craft a format. Fall through
// to the same path as `.mlx`.
let model = buildLocalModel(
dirName: dirName,
dirURL: itemURL,
fileURLs: fileURLs,
format: .mlxVLM
)
results.append(model)

Expand All @@ -84,7 +101,8 @@ public actor ModelLibraryManager {
private func buildLocalModel(
dirName: String,
dirURL: URL,
fileURLs: [URL]
fileURLs: [URL],
format: ModelFormat = .mlx
) -> LocalModel {
// Sum all .safetensors files for reported size
let sizeBytes: Int64 = fileURLs
Expand All @@ -104,13 +122,54 @@ public actor ModelLibraryManager {
displayName: dirName,
directory: dirURL,
sizeBytes: sizeBytes,
format: .mlx,
format: format,
quantization: quantization,
parameterCount: nil, // Deferred — requires config.json parser (v0.3+)
architecture: nil // Deferred — requires config.json parser (v0.3+)
)
}

/// The `model_type` identifiers that mlx-swift-lm's `MLXVLM` library
/// supports.
///
/// Source of truth: the `Libraries/MLXVLM/Models/*.swift` registry in
/// the mlx-swift-lm checkout — refresh this set whenever the SPM
/// dependency is bumped. Entries are stored lowercased because the
/// lookup lowercases the value read from `config.json` before
/// comparing. Kept in alphabetical order; being a `Set`, ordering has
/// no effect on membership.
private static let knownVLMTypes: Set<String> = [
    "fast_vlm",
    "gemma3",
    "glm_ocr",
    "idefics3",
    "lfm2_vl",
    "mistral3",
    "paligemma",
    "pixtral",
    "qwen2_5_vl",
    "qwen2_vl",
    "qwen3_5_vl",
    "qwen3_vl",
    "smolvlm",
    "smolvlm2",
]

/// Reads `model_type` from the directory's `config.json` and reports
/// `.mlxVLM` when it names a known VLM family, `.mlx` otherwise.
///
/// Best-effort by design: a missing file, malformed JSON, a non-object
/// top level, or a missing / non-string `model_type` all resolve to
/// `.mlx`, so one unparseable config cannot break the whole scan.
///
/// - Parameter directory: Model directory expected to contain `config.json`.
/// - Returns: `.mlxVLM` for a recognised VLM `model_type`; `.mlx` in every other case.
private func upgradeFormatIfVLM(directory: URL) -> ModelFormat {
    let configPath = directory.appendingPathComponent("config.json")

    guard let rawConfig = try? Data(contentsOf: configPath) else {
        return .mlx
    }
    guard let parsed = try? JSONSerialization.jsonObject(with: rawConfig),
          let root = parsed as? [String: Any]
    else {
        return .mlx
    }
    guard let declaredType = root["model_type"] as? String else {
        return .mlx
    }

    // The known-type set is stored lowercased, so normalise before the lookup.
    let isVisionLanguage = Self.knownVLMTypes.contains(declaredType.lowercased())
    return isVisionLanguage ? .mlxVLM : .mlx
}

/// Extracts a quantization string from a directory name.
///
/// Matches a trailing `-(\d+bit)` suffix, e.g. `Qwen3-8B-4bit` → `"4bit"`.
Expand Down
50 changes: 50 additions & 0 deletions MacMLXCore/Sources/MacMLXCore/Models/ImageAttachment.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import Foundation

/// A single image attached to a `ChatMessage`.
///
/// The type is Sendable and Codable so an attachment can round-trip
/// through `ConversationStore` JSON, `GenerateRequest` payloads, and
/// the OpenAI-multimodal HTTP wire format. It is declared in Core
/// rather than the GUI target because the CLI surfaces, the HTTP
/// server, and the GUI all need to refer to it.
public struct ImageAttachment: Codable, Hashable, Sendable, Equatable {
    /// Local file URL of the image bytes.
    ///
    /// Conversation save / load is responsible for copying user-picked
    /// files into `~/.mac-mlx/conversations/<uuid>/images/`, which keeps
    /// the URL stable across app restarts and lets it survive the user
    /// moving the original file. Until that persistence step lands
    /// (v0.4.1 UI+HTTP PR), the URL points directly at the user's pick
    /// site.
    public let fileURL: URL

    /// IANA MIME type, e.g. `image/jpeg` or `image/png`.
    ///
    /// Needed for the OpenAI multimodal `image_url` data-URL payload
    /// shape (`data:<mime>;base64,…`). It is stored explicitly instead
    /// of being re-derived from `fileURL.pathExtension` on every send,
    /// since some pickers hand back URLs with empty extensions (Photos
    /// library picks, paste-from-clipboard temp files).
    public let mimeType: String

    /// Creates an attachment from a file location and its MIME type.
    ///
    /// - Parameters:
    ///   - fileURL: Local file URL of the image bytes.
    ///   - mimeType: IANA MIME type describing the image.
    public init(fileURL: URL, mimeType: String) {
        self.mimeType = mimeType
        self.fileURL = fileURL
    }

    /// Lowercased path extension → MIME type, for the image formats
    /// this type recognises.
    private static let mimeTypesByExtension: [String: String] = [
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "webp": "image/webp",
        "gif": "image/gif",
        "heic": "image/heic",
        "bmp": "image/bmp",
    ]

    /// Best-effort MIME-type lookup for a path extension.
    ///
    /// Matching is case-insensitive. An unrecognised extension yields
    /// `nil`; callers should read that as "not an image we know how to
    /// attach" and reject the file.
    ///
    /// - Parameter ext: Path extension without a leading dot, e.g. `"png"`.
    /// - Returns: The matching MIME type, or `nil` when the extension is unknown.
    public static func mimeType(forPathExtension ext: String) -> String? {
        let normalized = ext.lowercased()
        return mimeTypesByExtension[normalized]
    }
}
11 changes: 11 additions & 0 deletions MacMLXCore/Sources/MacMLXCore/Models/LocalModel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,21 @@ public struct LocalModel: Codable, Hashable, Identifiable, Sendable {
/// Recognised on-disk model formats.
public enum ModelFormat: String, Codable, Hashable, Sendable, CaseIterable {
case mlx
/// Vision-language model (v0.4.1+). Same on-disk shape as `.mlx`,
/// distinguished by `model_type` in `config.json`. The library
/// scan first runs `detect(in:)` to filter MLX / GGUF / unknown
/// from the file listing, then upgrades `.mlx` → `.mlxVLM` if the
/// `model_type` matches a known VLM family.
case mlxVLM
case gguf
case unknown

/// Heuristic classifier from a directory's file listing.
///
/// File-listing inspection only — no I/O on the contents. Returns
/// `.mlx` for any directory that looks like an MLX text model;
/// `ModelLibraryManager.scan(_:)` is responsible for the further
/// `.mlx` → `.mlxVLM` upgrade based on `config.json`.
public static func detect(in fileNames: [String]) -> ModelFormat {
let lower = fileNames.map { $0.lowercased() }
if lower.contains(where: { $0.hasSuffix(".gguf") }) { return .gguf }
Expand Down
67 changes: 67 additions & 0 deletions MacMLXCore/Tests/MacMLXCoreTests/Engine/ChatMessageTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import Testing
import Foundation
@testable import MacMLXCore

@Suite("ChatMessage.images")
struct ChatMessageImagesTests {

    /// Omitting `images` at the call site yields a text-only message.
    @Test
    func defaultsToEmptyImagesWhenInitWithoutField() {
        let message = ChatMessage(role: .user, content: "Hi")
        #expect(message.images.isEmpty)
    }

    /// Attachments handed to the initializer are stored as given.
    @Test
    func preservesAttachedImages() {
        let attachment = ImageAttachment(
            fileURL: URL(fileURLWithPath: "/tmp/x.jpg"),
            mimeType: "image/jpeg"
        )
        let message = ChatMessage(
            role: .user,
            content: "What is this?",
            images: [attachment]
        )
        #expect(message.images == [attachment])
    }

    /// Conversation JSON written before v0.4.1 carries no `images`
    /// field. Decoding must fall back to an empty array so on-disk
    /// conversations keep loading unchanged after the upgrade.
    @Test
    func legacyJSONWithoutImagesFieldDecodesWithEmptyArray() throws {
        let legacy = """
        {
        "id": "1FAA0000-0000-0000-0000-000000000001",
        "role": "user",
        "content": "Hi"
        }
        """
        let message = try JSONDecoder().decode(ChatMessage.self, from: Data(legacy.utf8))
        #expect(message.role == .user)
        #expect(message.content == "Hi")
        #expect(message.images.isEmpty)
    }

    /// A message with one attachment survives an encode / decode cycle
    /// with every field intact.
    @Test
    func newJSONRoundTripsImages() throws {
        let attachment = ImageAttachment(
            fileURL: URL(fileURLWithPath: "/tmp/cat.jpg"),
            mimeType: "image/jpeg"
        )
        let original = ChatMessage(role: .user, content: "Describe.", images: [attachment])

        let encoded = try JSONEncoder().encode(original)
        let restored = try JSONDecoder().decode(ChatMessage.self, from: encoded)

        #expect(restored.id == original.id)
        #expect(restored.role == original.role)
        #expect(restored.content == original.content)
        #expect(restored.images.count == 1)

        let restoredImage = restored.images.first
        #expect(restoredImage?.fileURL == attachment.fileURL)
        #expect(restoredImage?.mimeType == attachment.mimeType)
    }

    /// A text-only message still decodes with an empty `images` array
    /// after being encoded.
    @Test
    func emptyImagesArrayRoundTrips() throws {
        let original = ChatMessage(role: .assistant, content: "Sure.")
        let encoded = try JSONEncoder().encode(original)
        let restored = try JSONDecoder().decode(ChatMessage.self, from: encoded)
        #expect(restored.images.isEmpty)
    }
}
Loading