From 0469a14a227f7f0800d36952baa7859312293546 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Wed, 10 Jun 2026 21:06:27 -0700 Subject: [PATCH] Skip discarded per-token logprob, cache display geometry, hoist normalizer statics App half of the #661 performance pass; pairs with cotabbyinference 6e1a9ba (P-core decode threads, gated logprob, halved KV allocation). - LlamaRuntimeCore now tells the engine to skip per-token log-probabilities whenever confidenceFloor == -infinity, the shipping default where ConfidenceSuppressionPolicy returns before reading the value. Every generated token was paying two O(vocab) passes plus a vocab-wide exp() to produce a number that was summed and discarded. Suggestions are byte-identical; raising the floor re-enables the computation per request. - AXHelper.displayGeometries() is cached and invalidated on didChangeScreenParameters instead of rebuilding NSScreen.screens + CGDisplayBounds for every AX rect conversion at the focus-poll cadence. - SuggestionTextNormalizer: think-block stripping short-circuits on a contains check before compiling either regex (both patterns require the literal tag), and the scaffolding-label list is length-sorted once, statically, instead of on every prediction. --- .../Services/Runtime/LlamaRuntimeCore.swift | 12 ++++++++ Cotabby/Support/AXHelper.swift | 29 ++++++++++++++++++- .../Support/SuggestionTextNormalizer.swift | 13 ++++++++- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index ff29a1eb..5384b9cf 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -381,6 +381,14 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { // Seed for the reuse path is sampled at the end of this decodePrompt; apply // the word-continuation constraint to it just like the fresh path does. 
engine.setForceWordContinuation(autocompleteSequenceID, options.forceWordContinuation) + // Per-token log-probabilities cost two O(vocab) passes each in the engine; + // only compute them when the confidence gate would actually read them. + // Re-assert per request: the floor is not part of the sampling fingerprint, + // so a reused sequence must not carry a stale flag. + engine.setComputeLogprob( + autocompleteSequenceID, + options.confidenceFloor > -.infinity + ) var mutableRemaining = remaining let status = engine.decodePrompt( autocompleteSequenceID, @@ -420,6 +428,10 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { // The engine samples the first (seed) token at the end of decodePrompt, so set the // word-continuation constraint here, before decoding. engine.setForceWordContinuation(seqID, options.forceWordContinuation) + // Skip the engine's per-token log-probability work (two O(vocab) passes per token) + // whenever confidence suppression is disabled — the shipping default — since the value + // would be summed and then discarded. + engine.setComputeLogprob(seqID, options.confidenceFloor > -.infinity) var tokens = promptTokens let status = engine.decodePrompt(seqID, &tokens, Int32(tokens.count), 0) diff --git a/Cotabby/Support/AXHelper.swift b/Cotabby/Support/AXHelper.swift index 773cdfb2..ef740234 100644 --- a/Cotabby/Support/AXHelper.swift +++ b/Cotabby/Support/AXHelper.swift @@ -774,8 +774,33 @@ enum AXHelper { return flipped } + /// Cached display list. Display configuration changes are rare (plug/unplug, resolution or + /// arrangement changes), but `cocoaRect`/`validatedCocoaTextRect` run for every AX rect at the + /// focus-poll cadence — rebuilding `NSScreen.screens` + `CGDisplayBounds` per conversion + /// multiplied AppKit/CoreGraphics traffic by the resolve rate for identical results. All AX + /// geometry work happens on the main thread, so unsynchronized statics are safe here. 
+ private static var cachedDisplayGeometries: [DisplayGeometry]? + + /// Invalidation hook for the cache above. macOS posts `didChangeScreenParameters` for every + /// event that can alter the display list (connect/disconnect, resolution, arrangement, Dock + /// and menu-bar resizes affecting `visibleFrame`). Lazily installed via the first + /// `displayGeometries()` call, so the observer always exists before a cached value could go + /// stale. + private static let displayChangeObserver: NSObjectProtocol = NotificationCenter.default.addObserver( + forName: NSApplication.didChangeScreenParametersNotification, + object: nil, + queue: .main + ) { _ in + cachedDisplayGeometries = nil + } + private static func displayGeometries() -> [DisplayGeometry] { - NSScreen.screens.compactMap { screen in + _ = displayChangeObserver + if let cachedDisplayGeometries { + return cachedDisplayGeometries + } + + let geometries = NSScreen.screens.compactMap { screen -> DisplayGeometry? in guard let number = screen.deviceDescription[NSDeviceDescriptionKey("NSScreenNumber")] as? NSNumber else { @@ -790,6 +815,8 @@ enum AXHelper { backingScaleFactor: screen.backingScaleFactor ) } + cachedDisplayGeometries = geometries + return geometries } /// Last-resort fallback for unusual virtual displays where AppKit cannot expose a display ID. diff --git a/Cotabby/Support/SuggestionTextNormalizer.swift b/Cotabby/Support/SuggestionTextNormalizer.swift index 03707d36..eafd1422 100644 --- a/Cotabby/Support/SuggestionTextNormalizer.swift +++ b/Cotabby/Support/SuggestionTextNormalizer.swift @@ -188,6 +188,13 @@ enum SuggestionTextNormalizer { /// Removes think-tagged reasoning blocks: complete blocks first, then any dangling open /// tag left when generation hit the token limit before the block was closed. 
private static func stripThinkBlocks(_ text: String) -> String { + // Both patterns below require a literal opening think tag, so this cheap scan lets the common case + // (no reasoning block — the vast majority of completions) skip regex work entirely. + // `String.range(of:options:.regularExpression)` compiles its pattern on every call, and + // this runs on the per-prediction critical path. + guard text.contains("") else { + return text + } var result = text if let complete = result.range(of: "[\\s\\S]*?", options: .regularExpression) { result.replaceSubrange(complete, with: "") @@ -269,12 +276,16 @@ enum SuggestionTextNormalizer { "App:" ] + /// `scaffoldingLabels` ordered longest-first, computed once. The ordering is what makes + /// "Text before the caret:" win over a shorter sibling; sorting on every call repeated that + /// work on the per-prediction critical path for an identical result. + private static let labelsByLengthDescending: [String] = scaffoldingLabels.sorted { $0.count > $1.count } + /// Removes a leading run of known prompt-scaffolding labels (see `scaffoldingLabels`), whether /// each sits on its own line or inline before the continuation. Only labels at the very start /// are stripped; a label appearing later in the text is left alone because by then it is far /// more likely to be real user content than echoed scaffolding. private static func stripLeadingScaffoldingLabels(_ text: String) -> String { - let labelsByLengthDescending = scaffoldingLabels.sorted { $0.count > $1.count } var working = text while true {