From 0d64c2d06bb427086a8d87d3788313f15bed30df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 12:04:04 +0200
Subject: [PATCH 01/20] Replace AI router debug Toast with bottom auto-hiding
 status strip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Toast spammed the screen on every generateXxx call and obscured
the bookmark form so the user couldn't interact while it was visible.
Replace it with a slim Material strip pinned to the bottom of the
screen that:

- shows "<action>: <model>…" with a spinner while inference runs
- shows "<action> via <model> — 1.23 s" on completion
- shows "<action>: no model ready (<reason>)" on failure
- auto-hides 3.5 s after Completed, 5 s after NoProvider

To time the call, LocalModelRouter now wraps each provider call with
a wall-clock measurement and emits a new RouteEvent.Completed with
durationMs + success. Picked is still emitted up front for the
in-flight state.

Layout: the strip lives as the last child of the outer Column with a
weighted Box wrapping the active screen, so when AnimatedVisibility
expands the strip from 0-height the screen above is pushed up — the
Save / Cancel button row stays visible and responsive. The outer
Column consumes the navigation bar inset once via
navigationBarsPadding(), so neither the screen's BottomAppBar nor the
strip applies its own gesture-pill padding (which previously caused
the BottomAppBar's own box to grow when the strip slid in).

DEBUG-only, matching the gating of the previous Toast.
---
 .../jaeckel/urlvault/android/MainActivity.kt  |  88 ++++++++---
 .../urlvault/android/ai/LocalModelRouter.kt   | 123 ++++++++++-----
 .../urlvault/ui/AiActivityStatusLine.kt       | 142 ++++++++++++++++++
 3 files changed, 298 insertions(+), 55 deletions(-)
 create mode 100644 shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AiActivityStatusLine.kt
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
index 6c02822..32c481c 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
@@ -2,11 +2,13 @@ package com.jaeckel.urlvault.android
 
 import android.content.Intent
 import android.os.Bundle
-import android.widget.Toast
 import androidx.activity.ComponentActivity
 import androidx.activity.compose.setContent
 import androidx.activity.enableEdgeToEdge
+import androidx.compose.foundation.layout.Box
 import androidx.compose.foundation.layout.Column
+import androidx.compose.foundation.layout.fillMaxSize
+import androidx.compose.foundation.layout.navigationBarsPadding
 import androidx.compose.foundation.layout.statusBarsPadding
 import androidx.compose.ui.Modifier
 import androidx.compose.runtime.LaunchedEffect
@@ -16,6 +18,7 @@ import androidx.compose.runtime.mutableStateOf
 import androidx.compose.runtime.produceState
 import androidx.compose.runtime.remember
 import androidx.compose.runtime.setValue
+import kotlinx.coroutines.delay
 import com.jaeckel.urlvault.ai.AiProviderIds
 import com.jaeckel.urlvault.ai.ModelCatalog
 import com.jaeckel.urlvault.ai.ModelCatalogEntry
@@ -30,6 +33,8 @@ import com.jaeckel.urlvault.android.sync.AndroidBitwardenPreferences
 import com.jaeckel.urlvault.model.Bookmark
 import com.jaeckel.urlvault.sync.BitwardenSyncService
 import com.jaeckel.urlvault.ui.AddEditBookmarkScreen
+import com.jaeckel.urlvault.ui.AiActivityState
+import com.jaeckel.urlvault.ui.AiActivityStatusLine
 import com.jaeckel.urlvault.ui.BookmarkListScreen
 import com.jaeckel.urlvault.ui.ModelComparisonScreen
 import com.jaeckel.urlvault.ui.ModelStatusBanner
@@ -99,28 +104,47 @@ class MainActivity : ComponentActivity() {
                     aiCoreService.initialize()
                 }
 
-                // DEBUG-only: surface which provider actually served each AI call
-                // so we can confirm an "activated" model is what's being used vs.
-                // silently falling back to AICore.
+                // DEBUG-only: surface which provider actually served each AI
+                // call (and how long it took) in a thin auto-hiding strip at
+                // the bottom of the screen. Replaces a much louder Toast that
+                // obscured the form while the user was trying to interact
+                // with it.
+                var aiActivity by remember { mutableStateOf<AiActivityState>(AiActivityState.Hidden) }
                 if (BuildConfig.DEBUG) {
                     LaunchedEffect(Unit) {
                         localModelRouter.events.collect { event ->
-                            val readinessLine = event.readiness.joinToString { (id, r) ->
-                                "${id.substringAfter(':')}=${if (r) "✓" else "✗"}"
-                            }
-                            val activeLine = if (event.activeIds.isEmpty()) "active=none"
-                                else "active=${event.activeIds.joinToString { it.substringAfter(':') }}"
-                            val head = when (event) {
+                            aiActivity = when (event) {
                                 is LocalModelRouter.RouteEvent.Picked ->
-                                    "AI ${event.action}: ${event.providerName}\n${event.reason}"
+                                    AiActivityState.Running(event.action, event.providerName)
+                                is LocalModelRouter.RouteEvent.Completed ->
+                                    AiActivityState.Completed(
+                                        action = event.action,
+                                        providerName = event.providerName,
+                                        durationMs = event.durationMs,
+                                        success = event.success,
+                                    )
                                 is LocalModelRouter.RouteEvent.None ->
-                                    "AI ${event.action}: NO PROVIDER\n${event.reason}"
+                                    AiActivityState.NoProvider(event.action, event.reason)
                             }
-                            val text = "$head\n$activeLine\n$readinessLine"
-                            Toast.makeText(this@MainActivity, text, Toast.LENGTH_LONG).show()
                         }
                     }
                 }
+                // Auto-hide once the user has had time to read the result.
+                // Running stays visible for as long as the LLM is working
+                // (we only transition out of it when Completed/None arrive).
+                LaunchedEffect(aiActivity) {
+                    when (aiActivity) {
+                        is AiActivityState.Completed -> {
+                            delay(3_500)
+                            aiActivity = AiActivityState.Hidden
+                        }
+                        is AiActivityState.NoProvider -> {
+                            delay(5_000)
+                            aiActivity = AiActivityState.Hidden
+                        }
+                        else -> {}
+                    }
+                }
 
                 // Show toggle for any status except Unknown (still probing)
                 val aiCoreAvailable = aiCoreStatus !is AICoreStatus.Unknown && aiCoreStatus !is AICoreStatus.Unavailable
@@ -153,10 +177,18 @@ class MainActivity : ComponentActivity() {
                 }
 
                 Column(
-                    // enableEdgeToEdge() lets content draw under the status
-                    // bar; without statusBarsPadding the banner would land
-                    // behind the system clock / battery icons.
-                    modifier = Modifier.statusBarsPadding(),
+                    // enableEdgeToEdge() lets content draw under the system
+                    // bars; the two *barsPadding modifiers reserve space at
+                    // top and bottom AND consume the corresponding insets so
+                    // descendants (notably the screens' Material Scaffolds
+                    // with BottomAppBar) don't double-pad. Without this, the
+                    // BottomAppBar kept its own gesture-pill padding even
+                    // when the AI activity strip slid in below it, making
+                    // the button row's box visibly grow.
+                    modifier = Modifier
+                        .fillMaxSize()
+                        .statusBarsPadding()
+                        .navigationBarsPadding(),
                 ) {
                     // Persistent status banner — surfaces the active model
                     // warming up or any in-flight download regardless of which
@@ -169,6 +201,12 @@ class MainActivity : ComponentActivity() {
                         catalog = ModelCatalog.builtIn + customEntries,
                         aiCoreId = AiProviderIds.AICORE,
                     )
+                    // Wrap the active screen in a weighted Box so the AI
+                    // activity strip below can claim its natural height
+                    // without overlapping the screen's own bottom buttons —
+                    // when the strip is visible the screen's available
+                    // height shrinks and its Save / Cancel row reflows up.
+                    Box(modifier = Modifier.weight(1f).fillMaxSize()) {
                 when (val screen = currentScreen) {
                     is Screen.List -> BookmarkListScreen(
                         viewModel = bookmarkViewModel,
@@ -302,7 +340,19 @@ class MainActivity : ComponentActivity() {
                         )
                     }
                 }
-                }   // close Column wrapping the banner + screen content
+                }   // close weighted Box wrapping the screen
+
+                    // DEBUG-only AI activity strip. Last child of the Column
+                    // so when AnimatedVisibility expands it from 0-height
+                    // the screen above is pushed up — its Save button stays
+                    // visible. The outer Column already consumed the nav
+                    // bar inset, so the strip needs no padding of its own.
+                    if (BuildConfig.DEBUG) {
+                        AiActivityStatusLine(
+                            state = aiActivity,
+                        )
+                    }
+                }   // close outer Column
             }
         }
     }
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
index 5847593..704086b 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
@@ -51,6 +51,23 @@ class LocalModelRouter(
             override val readiness: List<Pair<String, Boolean>>,
             val reason: String,
         ) : RouteEvent()
+
+        /**
+         * Fired by `generateXxx` *after* the provider call returns or throws.
+         * Carries the wall-clock duration so a UI status line can show
+         * "tags via Liquid LFM2 Extract — 1.24 s". Note that for `title` on
+         * pages with a usable `<title>`/`og:title`, no LLM ran — duration
+         * reflects only the page fetch, which is intentional.
+         */
+        data class Completed(
+            override val action: String,
+            override val activeIds: Set<String>,
+            override val readiness: List<Pair<String, Boolean>>,
+            val providerId: String,
+            val providerName: String,
+            val durationMs: Long,
+            val success: Boolean,
+        ) : RouteEvent()
     }
 
     private val _events = MutableSharedFlow<RouteEvent>(extraBufferCapacity = 16)
@@ -109,33 +126,6 @@ class LocalModelRouter(
         return PickResult(fallback, reason, active, readinessSummary)
     }
 
-    private suspend fun pickAndEmit(action: String): LocalModelProvider? {
-        val result = pickWithReason()
-        val provider = result.provider
-        if (provider != null) {
-            _events.tryEmit(
-                RouteEvent.Picked(
-                    action = action,
-                    activeIds = result.activeIds,
-                    readiness = result.readiness,
-                    providerId = provider.id,
-                    providerName = provider.displayName,
-                    reason = result.reason,
-                ),
-            )
-        } else {
-            _events.tryEmit(
-                RouteEvent.None(
-                    action = action,
-                    activeIds = result.activeIds,
-                    readiness = result.readiness,
-                    reason = result.reason,
-                ),
-            )
-        }
-        return provider
-    }
-
     /**
      * Whether at least one registered provider can serve a request right now.
      * Used by the UI to decide whether to drive bookmark generation through
@@ -181,20 +171,81 @@ class LocalModelRouter(
     }
 
     suspend fun generateTags(url: String, title: String, content: String): Result<List<String>> {
-        val provider = pickAndEmit("tags")
-            ?: return Result.failure(IllegalStateException("No ready local AI model"))
-        return provider.generateTags(url, title, content)
+        val pick = pickWithReason()
+        val provider = pick.provider
+        if (provider == null) {
+            emitNone("tags", pick)
+            return Result.failure(IllegalStateException("No ready local AI model"))
+        }
+        emitPicked("tags", provider, pick)
+        return runTimed("tags", provider, pick) { provider.generateTags(url, title, content) }
     }
 
     suspend fun generateDescription(url: String, title: String): Result<String> {
-        val provider = pickAndEmit("description")
-            ?: return Result.failure(IllegalStateException("No ready local AI model"))
-        return provider.generateDescription(url, title)
+        val pick = pickWithReason()
+        val provider = pick.provider
+        if (provider == null) {
+            emitNone("description", pick)
+            return Result.failure(IllegalStateException("No ready local AI model"))
+        }
+        emitPicked("description", provider, pick)
+        return runTimed("description", provider, pick) { provider.generateDescription(url, title) }
     }
 
     suspend fun generateTitle(url: String): Result<String> {
-        val provider = pickAndEmit("title")
-            ?: return Result.failure(IllegalStateException("No ready local AI model"))
-        return provider.generateTitle(url)
+        val pick = pickWithReason()
+        val provider = pick.provider
+        if (provider == null) {
+            emitNone("title", pick)
+            return Result.failure(IllegalStateException("No ready local AI model"))
+        }
+        emitPicked("title", provider, pick)
+        return runTimed("title", provider, pick) { provider.generateTitle(url) }
+    }
+
+    /** Publishes a [RouteEvent.Picked] for [action]; fire-and-forget, the tryEmit result is ignored. */
+    private fun emitPicked(action: String, provider: LocalModelProvider, pick: PickResult) {
+        val event = RouteEvent.Picked(
+            action = action,
+            activeIds = pick.activeIds,
+            readiness = pick.readiness,
+            providerId = provider.id,
+            providerName = provider.displayName,
+            reason = pick.reason,
+        )
+        _events.tryEmit(event)
+    }
+
+    /** Publishes a [RouteEvent.None] for [action]; fire-and-forget, the tryEmit result is ignored. */
+    private fun emitNone(action: String, pick: PickResult) {
+        val event = RouteEvent.None(
+            action = action,
+            activeIds = pick.activeIds,
+            readiness = pick.readiness,
+            reason = pick.reason,
+        )
+        _events.tryEmit(event)
+    }
+
+    /** Times [block] and always emits [RouteEvent.Completed] — on return, throw or cancellation. */
+    private suspend inline fun <T> runTimed(
+        action: String,
+        provider: LocalModelProvider,
+        pick: PickResult,
+        block: () -> Result<T>,
+    ): Result<T> {
+        val startNs = System.nanoTime() // monotonic; wall-clock time can jump mid-call
+        var succeeded = false // stays false when block() throws or is cancelled
+        try {
+            return block().also { succeeded = it.isSuccess }
+        } finally {
+            _events.tryEmit(
+                RouteEvent.Completed(
+                    action = action, activeIds = pick.activeIds, readiness = pick.readiness,
+                    providerId = provider.id, providerName = provider.displayName,
+                    durationMs = (System.nanoTime() - startNs) / 1_000_000, success = succeeded,
+                ),
+            )
+        }
+    }
 }
diff --git a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AiActivityStatusLine.kt b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AiActivityStatusLine.kt
new file mode 100644
index 0000000..6164592
--- /dev/null
+++ b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AiActivityStatusLine.kt
@@ -0,0 +1,142 @@
+package com.jaeckel.urlvault.ui
+
+import androidx.compose.animation.AnimatedVisibility
+import androidx.compose.animation.fadeIn
+import androidx.compose.animation.fadeOut
+import androidx.compose.animation.slideInVertically
+import androidx.compose.animation.slideOutVertically
+import androidx.compose.foundation.layout.Arrangement
+import androidx.compose.foundation.layout.Row
+import androidx.compose.foundation.layout.fillMaxWidth
+import androidx.compose.foundation.layout.padding
+import androidx.compose.foundation.layout.size
+import androidx.compose.material3.CircularProgressIndicator
+import androidx.compose.material3.MaterialTheme
+import androidx.compose.material3.Surface
+import androidx.compose.material3.Text
+import androidx.compose.runtime.Composable
+import androidx.compose.ui.Alignment
+import androidx.compose.ui.Modifier
+import androidx.compose.ui.graphics.Color
+import androidx.compose.ui.text.font.FontFamily
+import androidx.compose.ui.unit.dp
+
+/**
+ * What the bottom AI-activity strip ([AiActivityStatusLine]) should display.
+ * Replaces the debug Toast that used to surface router decisions on every
+ * `generateXxx` call. Auto-hide is a presentation concern and lives in the
+ * caller — the composable just renders whatever state it is given.
+ */
+sealed class AiActivityState {
+    data object Hidden : AiActivityState() // nothing to show; the strip is not visible
+
+    /** A provider was picked; inference is in flight. Shows a spinner. */
+    data class Running(
+        val action: String, // leading label of the line, printed before the provider name
+        val providerName: String,
+    ) : AiActivityState()
+
+    /** Inference finished. Shows the measured duration of the call. */
+    data class Completed(
+        val action: String,
+        val providerName: String,
+        val durationMs: Long, // rendered as "N ms" below one second, else "S.ss s"
+        val success: Boolean, // false appends " (failed)" and switches to error colours
+    ) : AiActivityState()
+
+    /** Router could not pick a provider — UI surfaces the reason in error colours. */
+    data class NoProvider(
+        val action: String,
+        val reason: String, // shown in parentheses after "no model ready"
+    ) : AiActivityState()
+}
+
+/**
+ * Slim auto-hiding strip rendered at the bottom of the app. Designed as the
+ * non-obstructive replacement for the debug Toast spam: a single line that
+ * slides up while AI work is in flight, then briefly shows the timing, then
+ * slides away. It takes its natural height, so place it as the last child
+ * of the Column holding your screen content; the content above is pushed up
+ * while [state] is not [AiActivityState.Hidden].
+ *
+ * Auto-hide of Completed / NoProvider is the caller's responsibility —
+ * use a `LaunchedEffect(state)` with a `delay` and reset to Hidden.
+ */
+@Composable
+fun AiActivityStatusLine(
+    state: AiActivityState,
+    modifier: Modifier = Modifier,
+) {
+    // AnimatedVisibility keeps composing its content during the exit
+    // animation, when [state] is already Hidden. Keep the last visible state
+    // so the text and colours don't blank out while the strip slides away.
+    val lastShown = androidx.compose.runtime.remember { androidx.compose.runtime.mutableStateOf(state) }
+    if (state !is AiActivityState.Hidden) lastShown.value = state
+    AnimatedVisibility(
+        visible = state !is AiActivityState.Hidden,
+        enter = fadeIn() + slideInVertically(initialOffsetY = { it }),
+        exit = fadeOut() + slideOutVertically(targetOffsetY = { it }),
+        modifier = modifier,
+    ) {
+        val (text, isRunning, isError) = when (val shown = lastShown.value) {
+            is AiActivityState.Running ->
+                Triple("${shown.action}: ${shown.providerName}…", true, false)
+            is AiActivityState.Completed -> Triple(
+                buildString {
+                    append(shown.action)
+                    append(" via ")
+                    append(shown.providerName)
+                    append(" — ")
+                    append(formatMs(shown.durationMs))
+                    if (!shown.success) append(" (failed)")
+                },
+                false,
+                !shown.success,
+            )
+            is AiActivityState.NoProvider ->
+                Triple("${shown.action}: no model ready (${shown.reason})", false, true)
+            // Only possible before any visible state arrived, when nothing is composed.
+            AiActivityState.Hidden -> Triple("", false, false)
+        }
+
+        Surface(
+            color = if (isError) MaterialTheme.colorScheme.errorContainer
+                    else MaterialTheme.colorScheme.surfaceVariant,
+            contentColor = if (isError) MaterialTheme.colorScheme.onErrorContainer
+                           else MaterialTheme.colorScheme.onSurfaceVariant,
+            tonalElevation = 4.dp,
+            shadowElevation = 4.dp,
+        ) {
+            Row(
+                modifier = Modifier
+                    .fillMaxWidth()
+                    .padding(horizontal = 12.dp, vertical = 6.dp),
+                horizontalArrangement = Arrangement.spacedBy(8.dp),
+                verticalAlignment = Alignment.CenterVertically,
+            ) {
+                if (isRunning) {
+                    CircularProgressIndicator(
+                        modifier = Modifier.size(14.dp),
+                        strokeWidth = 2.dp,
+                        color = MaterialTheme.colorScheme.onSurfaceVariant,
+                    )
+                }
+                Text(
+                    text = text,
+                    style = MaterialTheme.typography.bodySmall,
+                    fontFamily = FontFamily.Monospace,
+                    color = Color.Unspecified,
+                )
+            }
+        }
+    }
+}
+
+private fun formatMs(ms: Long): String {
+    // Below one second: plain milliseconds.
+    if (ms < 1000) return "$ms ms"
+    // Otherwise seconds with two truncated decimals (no String.format in commonMain).
+    val seconds = ms / 1000
+    val fraction = ((ms % 1000) / 10).toString().padStart(2, '0')
+    return "$seconds.$fraction s"
+}

From 1ad30bc4c2e61ab23533a234c22a755bb10d2b4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 12:37:58 +0200
Subject: [PATCH 02/20] Wire HF token end-to-end and fix gated-model 401s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three connected changes for downloading gated LiteRT-LM / Gemma bundles:

1. Stop sending Authorization on cross-origin redirects in
   ModelDownloadManager. HF 302s gated downloads to a pre-signed
   cas-bridge.xethub.hf.co URL that already authenticates via
   query-string token; an extra Bearer header makes the CDN return
   401. Track the original host and only attach the header while the
   request is on it.

2. Add a Settings UI for the HF token (HuggingFaceTokenRow in
   SettingsScreen). The infrastructure
   (LocalModelPreferences.saveHfToken / loadHfToken,
   EncryptedSharedPreferences) already existed but no screen ever
   wrote the value. The row also surfaces a build-time fallback so
   users of CI builds don't need to paste anything.

3. Bake an optional default into the APK from `HF_TOKEN`:
   - androidApp/build.gradle.kts reads HF_TOKEN env var or the
     `hfToken` property in <repo>/local.properties (gitignored) and
     emits BuildConfig.HF_TOKEN_DEFAULT, with a non-token-character
     filter so the generated string literal is always safe.
   - LocalModelPreferences.loadHfToken() now falls back to the
     BuildConfig field when the user hasn't saved one.
   - build.yml and release.yml workflows pass HF_TOKEN through from
     a GitHub repo secret. Both still build with the secret absent.

Plus two unrelated polish items folded in:

- BookmarkListScreen: replace the 🔄 / ⚙️ emoji in the top app bar
  with Icons.Default.Sync / Icons.Default.Settings (added
  compose.materialIconsExtended to shared/commonMain).
- README: new "Hugging Face Token (gated models)" section explaining
  the user-entered, env-var, and local.properties paths plus the
  one-time per-repo licence acknowledgement step.
---
 .github/workflows/build.yml                   |  6 ++
 .github/workflows/release.yml                 |  5 ++
 README.md                                     | 40 +++++++++
 androidApp/build.gradle.kts                   | 26 ++++++
 .../jaeckel/urlvault/android/MainActivity.kt  | 11 +++
 .../android/ai/LocalModelPreferences.kt       | 23 ++++-
 .../android/ai/ModelDownloadManager.kt        | 26 ++++--
 shared/build.gradle.kts                       |  1 +
 .../jaeckel/urlvault/ui/BookmarkListScreen.kt | 15 ++--
 .../com/jaeckel/urlvault/ui/SettingsScreen.kt | 89 +++++++++++++++++++
 10 files changed, 228 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 62a571b..871207d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -34,6 +34,12 @@ jobs:
         run: ./gradlew :shared:check --no-daemon
 
       - name: Build Android debug APK
+        env:
+          # Optional Hugging Face read-token baked into the APK at build
+          # time; needed to download gated LiteRT-LM / Gemma model bundles
+          # without the user pasting one into Settings. Repository secret;
+          # build still succeeds if absent (token defaults to empty).
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: ./gradlew :androidApp:assembleDebug --no-daemon
 
       - name: Resolve APK artifact name
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 7c13f0c..9bf031c 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -49,6 +49,11 @@ jobs:
           ANDROID_STORE_PASSWORD: ${{ secrets.ANDROID_STORE_PASSWORD }}
           ANDROID_KEY_ALIAS: ${{ secrets.ANDROID_KEY_ALIAS }}
           ANDROID_KEY_PASSWORD: ${{ secrets.ANDROID_KEY_PASSWORD }}
+          # Optional Hugging Face read-token baked into the APK so gated
+          # LiteRT-LM / Gemma bundles can be downloaded without the user
+          # pasting a token. Empty / absent secret leaves the default empty
+          # and the user is prompted in Settings.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           ./gradlew :androidApp:assembleRelease --no-daemon \
             -PappVersion=${{ steps.version.outputs.version }} \
diff --git a/README.md b/README.md
index 2b18181..0bbdb93 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,46 @@ Per-platform repository implementations:
 ./gradlew :shared:build                 # Build the KMP library
 ```
 
+### Hugging Face Token (gated models)
+
+Most LiteRT-LM models in the catalog (Gemma 3, Gemma 4, FunctionGemma) are *gated* on Hugging Face — the API will reject downloads with HTTP 401 until two things are true:
+
+1. You hold a Hugging Face access token with **read** scope. Create one at <https://huggingface.co/settings/tokens>.
+2. You've accepted each model's licence on its HF page (e.g. <https://huggingface.co/google/gemma-3-1b-it>). Acceptance is per-repo and is a one-time click on the web UI.
+
+URLVault accepts the token from three sources, in this order of precedence:
+
+1. **User-entered** — Settings → Local AI Models → "Hugging Face token". Stored in `EncryptedSharedPreferences` on the device. Best for personal builds.
+2. **Build-time `HF_TOKEN` env var** — read by `androidApp/build.gradle.kts` and exposed as `BuildConfig.HF_TOKEN_DEFAULT`. Used by CI.
+3. **Build-time `hfToken` in `local.properties`** — same destination, fallback when the env var is absent. Used by local developer builds.
+
+The Settings row reads "Using token bundled with this build" when sources 2 or 3 are present and the user hasn't entered one of their own.
+
+#### Local developer builds
+
+Add a single line to `local.properties` at the repo root (already in `.gitignore` — the token never leaves your machine):
+
+```properties
+hfToken=hf_xxxxxxxxxxxxxxxxxxxx
+```
+
+After that, `./gradlew :androidApp:assembleDebug` and `./gradlew :androidApp:installDebug` will pick the token up automatically. Or set the env var per-invocation:
+
+```bash
+HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx ./gradlew :androidApp:assembleDebug
+```
+
+#### CI builds
+
+Add `HF_TOKEN` as a repository secret on GitHub:
+
+- *Settings → Secrets and variables → Actions → New repository secret*, name `HF_TOKEN`.
+- The existing `build.yml` and `release.yml` workflows already read it. With the secret absent, builds still succeed and the APK ships with the field empty (the user is prompted in Settings).
+
+> **Security note.** Anything baked into the APK can be recovered by reverse-engineering. Only ship a *read-only* token that is acceptable for the people who will install the build. The user-entered path stores the token in EncryptedSharedPreferences (Android Keystore-wrapped) and is the safer default for shared / public builds.
+
+The downloader scrubs the `Authorization` header on cross-origin redirects (HF 302s gated downloads to a pre-signed CDN URL on `cas-bridge.xethub.hf.co`, which would otherwise reject the extra header with 401), so the token only travels to `huggingface.co` itself.
+
 ### iOS
 
 1. Open `iosApp/iosApp.xcodeproj` in Xcode
diff --git a/androidApp/build.gradle.kts b/androidApp/build.gradle.kts
index 5398330..c669085 100644
--- a/androidApp/build.gradle.kts
+++ b/androidApp/build.gradle.kts
@@ -1,6 +1,7 @@
 import org.jetbrains.kotlin.gradle.dsl.JvmTarget
 import java.nio.file.Files
 import java.util.Base64
+import java.util.Properties
 
 plugins {
     alias(libs.plugins.android.application)
@@ -42,6 +43,31 @@ android {
 
         testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
 
+        // Optional Hugging Face read-token, baked into the APK at build time
+        // so the downloader can fetch gated LiteRT-LM bundles without the
+        // user pasting a token. Two sources, in order of precedence:
+        //   1. HF_TOKEN env var — used by CI (GitHub Actions secret). A blank
+        //      value (e.g. an unset Actions secret) counts as absent.
+        //   2. `hfToken` property in <repo>/local.properties — used for
+        //      local developer builds; the file is gitignored.
+        // Empty default lets the build succeed without either; the user can
+        // paste a token into the Settings screen instead.
+        // Whitespace and any non-token characters are stripped to keep the
+        // generated string literal safe — real HF tokens are alphanumeric
+        // with `_` / `-`. Note: anything baked into the APK is recoverable
+        // via reverse engineering — only ship a *read-only* HF token here.
+        val hfTokenFromLocalProps: String? = rootProject.file("local.properties")
+            .takeIf { it.exists() }
+            ?.let { f ->
+                val props = Properties()
+                f.inputStream().use { stream -> props.load(stream) }
+                props.getProperty("hfToken")
+            }
+        val hfTokenDefault = (System.getenv("HF_TOKEN")?.takeIf { it.isNotBlank() } ?: hfTokenFromLocalProps ?: "")
+            .trim()
+            .filter { it.isLetterOrDigit() || it == '_' || it == '-' }
+        buildConfigField("String", "HF_TOKEN_DEFAULT", "\"$hfTokenDefault\"")
+
         // Llamatik ships native libs for arm64-v8a, armeabi-v7a, x86, x86_64.
         // libllama_jni.so alone is ~23 MB per ABI; restricting to arm64-v8a cuts
         // ~90 MB of unused code from the APK. Every supported Android device
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
index 32c481c..bd811b8 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
@@ -90,6 +90,11 @@ class MainActivity : ComponentActivity() {
                 val warmingIds by localModelRouter.warmingIds.collectAsState()
                 var customEntries by remember { mutableStateOf(localModelPrefs.loadCustomEntries()) }
                 var activeIds by remember { mutableStateOf(localModelPrefs.loadActiveIds()) }
+                // The user-only token (the build-time fallback isn't shown as
+                // a saved value — the row says "Using token bundled with this
+                // build" instead).
+                var hfToken by remember { mutableStateOf(localModelPrefs.loadUserHfToken().orEmpty()) }
+                val hfTokenFromBuild = remember { localModelPrefs.hasBuildTimeHfToken() }
                 // Settings reads two heavy values from EncryptedSharedPreferences:
                 // the Bitwarden credentials (decrypts via Keystore) and the
                 // field-history blob. Cache them in remembered state and only
@@ -306,6 +311,12 @@ class MainActivity : ComponentActivity() {
                                 // generate() call doesn't pay model-load cost.
                                 if (active) appScope.launch { localModelRouter.warmUpActive() }
                             },
+                            hfToken = hfToken,
+                            hfTokenFromBuild = hfTokenFromBuild,
+                            onHfTokenChanged = { newToken ->
+                                hfToken = newToken
+                                localModelPrefs.saveHfToken(newToken)
+                            },
                             onAddCustomModel = { hfRepo, hfFile, displayName ->
                                 val newEntry = ModelCatalogEntry(
                                     id = "custom:" + hfRepo.lowercase().replace('/', '_') + ":" + hfFile.lowercase(),
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelPreferences.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelPreferences.kt
index 303da5e..6627ecf 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelPreferences.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelPreferences.kt
@@ -5,6 +5,7 @@ import android.content.SharedPreferences
 import androidx.security.crypto.EncryptedSharedPreferences
 import androidx.security.crypto.MasterKey
 import com.jaeckel.urlvault.ai.ModelCatalogEntry
+import com.jaeckel.urlvault.android.BuildConfig
 import kotlinx.serialization.encodeToString
 import kotlinx.serialization.json.Json
 
@@ -51,7 +52,24 @@ class LocalModelPreferences(private val context: Context) {
         prefs.edit().putStringSet(KEY_ACTIVE_IDS, ids.toSet()).apply()
     }
 
-    fun loadHfToken(): String? = prefs.getString(KEY_HF_TOKEN, null)
+    /**
+     * User-saved token wins; if blank, fall back to [BuildConfig.HF_TOKEN_DEFAULT]
+     * so a CI build that injected `HF_TOKEN` can ship gated-model access without
+     * any user action. Returns null when neither source has a token.
+     */
+    fun loadHfToken(): String? {
+        val saved = prefs.getString(KEY_HF_TOKEN, null)?.takeIf { it.isNotBlank() }
+        if (saved != null) return saved
+        return BuildConfig.HF_TOKEN_DEFAULT.takeIf { it.isNotBlank() }
+    }
+
+    /**
+     * The literal user-entered value (without the build-time fallback) so the
+     * Settings UI can show "(none)" vs. "saved: hf_…" honestly. Use
+     * [loadHfToken] for the value the downloader should actually send.
+     */
+    fun loadUserHfToken(): String? =
+        prefs.getString(KEY_HF_TOKEN, null)?.takeIf { it.isNotBlank() }
 
     fun saveHfToken(token: String?) {
         prefs.edit().apply {
@@ -59,6 +77,9 @@ class LocalModelPreferences(private val context: Context) {
         }.apply()
     }
 
+    /** True iff the build baked in a token (`HF_TOKEN` env var or local.properties `hfToken`). */
+    fun hasBuildTimeHfToken(): Boolean = BuildConfig.HF_TOKEN_DEFAULT.isNotBlank()
+
     companion object {
         private const val PREFS_NAME = "urlvault_local_models_encrypted"
         private const val KEY_CUSTOM_ENTRIES = "custom_entries"
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/ModelDownloadManager.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/ModelDownloadManager.kt
index 0515e2d..bb49a68 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/ModelDownloadManager.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/ModelDownloadManager.kt
@@ -290,14 +290,16 @@ class ModelDownloadManager(
      * Probe the server for the file's total size without downloading the
      * whole thing. Sends `Range: bytes=0-0` (a 1-byte slice) so the response
      * carries a `Content-Range: bytes 0-0/<total>` header we can parse.
-     * Follows the same manual-redirect chain as openWithRedirects to keep
-     * the Authorization header attached on CDN redirects. Returns -1 if the
+     * Follows the same manual-redirect chain as openWithRedirects, mirroring
+     * its same-host Authorization rule (see notes there). Returns -1 if the
      * server doesn't report a total (e.g. on a non-Range-capable origin).
      */
     private fun discoverTotalBytes(urlString: String, token: String?, maxHops: Int = 5): Long {
+        val originalHost = URL(urlString).host
         var url = URL(urlString)
         var hops = 0
         while (true) {
+            val sameHost = url.host.equals(originalHost, ignoreCase = true)
             val conn = (url.openConnection() as HttpURLConnection).apply {
                 requestMethod = "GET"
                 connectTimeout = 30_000
@@ -305,7 +307,9 @@ class ModelDownloadManager(
                 instanceFollowRedirects = false
                 setRequestProperty("User-Agent", "URLVault/1.0")
                 setRequestProperty("Range", "bytes=0-0")
-                if (!token.isNullOrBlank()) setRequestProperty("Authorization", "Bearer $token")
+                if (sameHost && !token.isNullOrBlank()) {
+                    setRequestProperty("Authorization", "Bearer $token")
+                }
             }
             try {
                 val code = conn.responseCode
@@ -338,9 +342,13 @@ class ModelDownloadManager(
     }
 
     /**
-     * Follow up to 5 redirects manually so we re-apply the Authorization /
-     * Range headers on each hop (HttpURLConnection's automatic redirect
-     * stripping would otherwise drop them).
+     * Follow up to 5 redirects manually so we re-apply the Range header on
+     * each hop (HttpURLConnection would otherwise drop it). The Authorization
+     * header is only attached while we are still on the **original host** —
+     * Hugging Face 302s gated downloads to a pre-signed CDN URL on
+     * `cas-bridge.xethub.hf.co` (and similar), and that CDN rejects extra
+     * `Authorization: Bearer …` headers with HTTP 401. Browsers and curl
+     * likewise drop auth on cross-host redirects, which also keeps the token from leaking.
      */
     private fun openWithRedirects(
         urlString: String,
@@ -348,9 +356,11 @@ class ModelDownloadManager(
         token: String?,
         maxHops: Int = 5,
     ): OpenResult {
+        val originalHost = URL(urlString).host
         var url = URL(urlString)
         var hops = 0
         while (true) {
+            val sameHost = url.host.equals(originalHost, ignoreCase = true)
             val conn = (url.openConnection() as HttpURLConnection).apply {
                 requestMethod = "GET"
                 connectTimeout = 30_000
@@ -358,7 +368,9 @@ class ModelDownloadManager(
                 instanceFollowRedirects = false
                 setRequestProperty("User-Agent", "URLVault/1.0")
                 if (rangeStart > 0) setRequestProperty("Range", "bytes=$rangeStart-")
-                if (!token.isNullOrBlank()) setRequestProperty("Authorization", "Bearer $token")
+                if (sameHost && !token.isNullOrBlank()) {
+                    setRequestProperty("Authorization", "Bearer $token")
+                }
             }
             val code = conn.responseCode
             when (code) {
diff --git a/shared/build.gradle.kts b/shared/build.gradle.kts
index 1a558aa..a2ab916 100644
--- a/shared/build.gradle.kts
+++ b/shared/build.gradle.kts
@@ -37,6 +37,7 @@ kotlin {
             implementation(compose.runtime)
             implementation(compose.foundation)
             implementation(compose.material3)
+            implementation(compose.materialIconsExtended)
             implementation(compose.ui)
             implementation(compose.components.resources)
             implementation(compose.components.uiToolingPreview)
diff --git a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/BookmarkListScreen.kt b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/BookmarkListScreen.kt
index 387f199..cf103cd 100644
--- a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/BookmarkListScreen.kt
+++ b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/BookmarkListScreen.kt
@@ -17,6 +17,9 @@ import androidx.compose.foundation.layout.width
 import androidx.compose.foundation.lazy.LazyColumn
 import androidx.compose.foundation.lazy.LazyRow
 import androidx.compose.foundation.lazy.items
+import androidx.compose.material.icons.Icons
+import androidx.compose.material.icons.filled.Settings
+import androidx.compose.material.icons.filled.Sync
 import androidx.compose.material3.Card
 import androidx.compose.material3.CardDefaults
 import androidx.compose.material3.CircularProgressIndicator
@@ -117,17 +120,17 @@ fun BookmarkListScreen(
                                 strokeWidth = 2.dp
                             )
                         } else {
-                            Text(
-                                text = "\uD83D\uDD04",
-                                style = MaterialTheme.typography.titleMedium
+                            Icon(
+                                imageVector = Icons.Default.Sync,
+                                contentDescription = "Sync with Bitwarden",
                             )
                         }
                     }
                     // Settings button
                     IconButton(onClick = onOpenSettings) {
-                        Text(
-                            text = "\u2699\uFE0F",
-                            style = MaterialTheme.typography.titleMedium
+                        Icon(
+                            imageVector = Icons.Default.Settings,
+                            contentDescription = "Settings",
                         )
                     }
                 }
diff --git a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/SettingsScreen.kt b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/SettingsScreen.kt
index e3c8fb0..05e45a3 100644
--- a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/SettingsScreen.kt
+++ b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/SettingsScreen.kt
@@ -78,6 +78,9 @@ fun SettingsScreen(
     onDeleteModel: (ModelCatalogEntry) -> Unit = {},
     onToggleModelActive: (ModelCatalogEntry, Boolean) -> Unit = { _, _ -> },
     onAddCustomModel: (hfRepo: String, hfFile: String, displayName: String) -> Unit = { _, _, _ -> },
+    hfToken: String = "",
+    hfTokenFromBuild: Boolean = false,
+    onHfTokenChanged: (String) -> Unit = {},
     onOpenComparison: () -> Unit = {},
     onSaveCredentials: (BitwardenCredentials) -> Unit,
     onNavigateBack: () -> Unit,
@@ -438,6 +441,12 @@ fun SettingsScreen(
                     }
                 }
 
+                HuggingFaceTokenRow(
+                    token = hfToken,
+                    fromBuild = hfTokenFromBuild,
+                    onTokenChanged = onHfTokenChanged,
+                )
+
                 CustomModelEntryRow(onAdd = onAddCustomModel)
 
                 Button(
@@ -690,6 +699,86 @@ private fun ModelCatalogRow(
     }
 }
 
+/**
+ * Lets the user paste a Hugging Face access token so the downloader can
+ * fetch gated repos (most LiteRT-LM Gemma bundles, FunctionGemma, etc.).
+ * Acceptance of each model's licence on huggingface.co is also required —
+ * the token alone doesn't grant access.
+ */
+@Composable
+private fun HuggingFaceTokenRow(
+    token: String,
+    fromBuild: Boolean,
+    onTokenChanged: (String) -> Unit,
+) {
+    // When neither a user-saved nor a build-time token exists, default the
+    // row to expanded so the user is nudged to enter one.
+    var expanded by remember(token, fromBuild) {
+        mutableStateOf(token.isBlank() && !fromBuild)
+    }
+    val masked = if (token.isBlank()) "" else token.take(4) + "…" + token.takeLast(4)
+
+    Column(
+        modifier = Modifier.fillMaxWidth(),
+        verticalArrangement = Arrangement.spacedBy(4.dp),
+    ) {
+        Row(
+            modifier = Modifier.fillMaxWidth(),
+            verticalAlignment = Alignment.CenterVertically,
+            horizontalArrangement = Arrangement.SpaceBetween,
+        ) {
+            Column(modifier = Modifier.weight(1f)) {
+                Text("Hugging Face token", style = MaterialTheme.typography.bodyLarge)
+                Text(
+                    text = when {
+                        token.isNotBlank() -> "Saved: $masked"
+                        fromBuild -> "Using token bundled with this build"
+                        else -> "Required for gated models (Gemma, FunctionGemma)"
+                    },
+                    style = MaterialTheme.typography.bodySmall,
+                    color = MaterialTheme.colorScheme.onSurfaceVariant,
+                )
+            }
+            Switch(checked = expanded, onCheckedChange = { expanded = it })
+        }
+        if (expanded) {
+            var draft by remember(token) { mutableStateOf(token) }
+            OutlinedTextField(
+                value = draft,
+                onValueChange = { draft = it },
+                label = { Text("hf_… (read access)") },
+                singleLine = true,
+                modifier = Modifier.fillMaxWidth(),
+            )
+            Text(
+                text = "Create one at huggingface.co/settings/tokens, then accept each gated " +
+                    "model's licence on its page (e.g. google/gemma-3-1b-it).",
+                style = MaterialTheme.typography.bodySmall,
+                color = MaterialTheme.colorScheme.onSurfaceVariant,
+            )
+            Row(horizontalArrangement = Arrangement.spacedBy(8.dp)) {
+                Button(
+                    onClick = {
+                        onTokenChanged(draft.trim())
+                        expanded = false
+                    },
+                    enabled = draft.trim() != token,
+                    modifier = Modifier.weight(1f),
+                ) { Text(if (token.isBlank()) "Save token" else "Update token") }
+                if (token.isNotBlank()) {
+                    Button(
+                        onClick = {
+                            onTokenChanged("")
+                            expanded = false
+                        },
+                        modifier = Modifier.weight(1f),
+                    ) { Text("Clear") }
+                }
+            }
+        }
+    }
+}
+
 @Composable
 private fun CustomModelEntryRow(
     onAdd: (hfRepo: String, hfFile: String, displayName: String) -> Unit,

From 48a8e5729786e96315b3ed4f2ead39df4f1f6957 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 12:55:06 +0200
Subject: [PATCH 03/20] Address PR #12 review: monotonic timing, robust
 Completed emit, DROP_OLDEST
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- runTimed now uses System.nanoTime() instead of System.currentTimeMillis()
  so wall-clock jumps (NTP / manual changes) can't produce negative or
  inflated durationMs. Pointed out by Gemini and Copilot independently.

- runTimed wraps the call in try/finally so RouteEvent.Completed is always
  emitted, including when block() throws (notably CancellationException,
  which runCatching re-raises). Without this, a cancelled call left the
  status strip stuck in "Running…" until the next event fired. Matches the
  KDoc claim that Completed fires after the call "returns or throws".

- _events now uses BufferOverflow.DROP_OLDEST. With Picked + Completed pairs
  per call, a slow / backgrounded collector could overflow the 16-slot
  buffer; tryEmit dropping the latest event would leave the UI stuck. With
  DROP_OLDEST, tryEmit always succeeds and the consumer eventually catches
  up to current state.

- AiActivityStatusLine KDoc rewritten to match the actual usage in
  MainActivity (last child of a Column, content above given weight(1f))
  rather than the original "in an overlaying Box" advice that would
  reintroduce the obscuring behaviour the strip was meant to fix.

The Copilot suggestion to make `block: suspend () -> Result<T>` (and
crossinline) was checked but not applied — `runTimed` is `suspend inline`
already, which is exactly the construct that lets a non-suspending lambda
parameter call suspending provider methods (the body is inlined into the
suspend caller). The existing form compiles and runs correctly.
---
 .../urlvault/android/ai/LocalModelRouter.kt   | 57 ++++++++++++++-----
 .../urlvault/ui/AiActivityStatusLine.kt       | 15 +++--
 2 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
index 704086b..f0e9d66 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
@@ -3,6 +3,7 @@ package com.jaeckel.urlvault.android.ai
 import android.util.Log
 import com.jaeckel.urlvault.ai.LocalModelProvider
 import com.jaeckel.urlvault.ai.LocalModelRegistry
+import kotlinx.coroutines.channels.BufferOverflow
 import kotlinx.coroutines.flow.MutableSharedFlow
 import kotlinx.coroutines.flow.MutableStateFlow
 import kotlinx.coroutines.flow.SharedFlow
@@ -70,7 +71,14 @@ class LocalModelRouter(
         ) : RouteEvent()
     }
 
-    private val _events = MutableSharedFlow<RouteEvent>(extraBufferCapacity = 16)
+    // DROP_OLDEST so a slow / backgrounded collector can never stall the
+    // generate path or silently lose the latest event. The UI only cares
+    // about *current* state, so dropping older Picked/Completed pairs is
+    // safer than letting tryEmit return false for the most recent one.
+    private val _events = MutableSharedFlow<RouteEvent>(
+        extraBufferCapacity = 16,
+        onBufferOverflow = BufferOverflow.DROP_OLDEST,
+    )
     val events: SharedFlow<RouteEvent> = _events.asSharedFlow()
 
     /**
@@ -227,25 +235,44 @@ class LocalModelRouter(
         )
     }
 
+    /**
+     * Times [block] and emits a [RouteEvent.Completed] regardless of how it
+     * exits — normal `Result` (success or failure), or thrown exception
+     * (notably coroutine cancellation, which `runCatching` re-raises). Without
+     * the try/finally, a cancellation would leave the UI strip stuck in
+     * "Running…" until the next event happened to arrive.
+     *
+     * `inline` is what lets the non-suspending `block` parameter actually call
+     * suspending provider methods — the lambda body is inlined into this
+     * `suspend` function's body, so it runs in a suspending context.
+     *
+     * `nanoTime` is monotonic; `currentTimeMillis` is wall-clock and can jump
+     * backwards on NTP / manual clock changes, producing negative durations.
+     */
     private suspend inline fun <T> runTimed(
         action: String,
         provider: LocalModelProvider,
         pick: PickResult,
         block: () -> Result<T>,
     ): Result<T> {
-        val t0 = System.currentTimeMillis()
-        val result = block()
-        _events.tryEmit(
-            RouteEvent.Completed(
-                action = action,
-                activeIds = pick.activeIds,
-                readiness = pick.readiness,
-                providerId = provider.id,
-                providerName = provider.displayName,
-                durationMs = System.currentTimeMillis() - t0,
-                success = result.isSuccess,
-            ),
-        )
-        return result
+        val t0 = System.nanoTime()
+        var success = false
+        try {
+            val result = block()
+            success = result.isSuccess
+            return result
+        } finally {
+            _events.tryEmit(
+                RouteEvent.Completed(
+                    action = action,
+                    activeIds = pick.activeIds,
+                    readiness = pick.readiness,
+                    providerId = provider.id,
+                    providerName = provider.displayName,
+                    durationMs = (System.nanoTime() - t0) / 1_000_000,
+                    success = success,
+                ),
+            )
+        }
     }
 }
diff --git a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AiActivityStatusLine.kt b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AiActivityStatusLine.kt
index 6164592..8dd765b 100644
--- a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AiActivityStatusLine.kt
+++ b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AiActivityStatusLine.kt
@@ -55,11 +55,18 @@ sealed class AiActivityState {
  * Slim auto-hiding strip rendered at the bottom of the app. Designed as the
  * non-obstructive replacement for the debug Toast spam: a single line that
  * slides up while AI work is in flight, then briefly shows the timing, then
- * slides away. Place it at the bottom of a Box that wraps your screen
- * content; it will draw above everything else when [state] is not Hidden.
+ * slides away.
  *
- * Auto-hide of [Completed] / [NoProvider] is the caller's responsibility —
- * use a `LaunchedEffect(state)` with a `delay` and reset to [Hidden].
+ * Add it as the **last child of your screen's Column** (with the screen
+ * content above it given `Modifier.weight(1f)`) so it claims real layout
+ * space when visible and pushes content up. Putting it in an overlaying
+ * `Box` will reintroduce the obscuring behaviour the original Toast had —
+ * the whole point of this strip is that buttons stay reachable while it's
+ * showing.
+ *
+ * Auto-hide of [AiActivityState.Completed] / [AiActivityState.NoProvider] is
+ * the caller's responsibility — use a `LaunchedEffect(state)` with a `delay`
+ * and reset to [AiActivityState.Hidden].
  */
 @Composable
 fun AiActivityStatusLine(

From b8cc1070691fa77f98961f5593c63f6a3e1525ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 13:01:27 +0200
Subject: [PATCH 04/20] Disable vision/audio backends in LiteRT-LM EngineConfig
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every entry in ModelCatalog is text-only, but the bridge was passing
`visionBackend = backend` and `audioBackend = backend` to EngineConfig.
The SDK then probes for vision/audio sections in the .litertlm bundle
during initialize() and aborts text-only models with:

  Failed to create engine: NOT_FOUND: TF_LITE_VISION_ENCODER
  not found in the model.

Affected every text-only LiteRT-LM model — FunctionGemma 270M,
Gemma 3 270M, Qwen3 0.6B, etc. — on every backend (NPU → GPU → CPU
all fail the same way), so load() always threw.

Set visionBackend / audioBackend to null. When a true multi-modal
Gemma 4 E2B bundle lands later, set them per-entry instead of bridge-
wide.
---
 .../jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
index 9a04063..2e2ad0b 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
@@ -106,12 +106,20 @@ class LiteRtLmSdkBridge(
                 for ((label, backend) in backendsToTry) {
                     val t0 = System.currentTimeMillis()
                     Log.i(TAG, "load: trying backend=$label for $absolutePath")
+                    // visionBackend / audioBackend left null: every entry in
+                    // ModelCatalog is text-only. Setting them to `backend`
+                    // tells the engine to enable those modalities, and
+                    // initialize() then fails with `NOT_FOUND:
+                    // TF_LITE_VISION_ENCODER not found in the model.` for
+                    // text-only bundles (FunctionGemma 270M, Gemma 3 270M,
+                    // Qwen3 0.6B, etc.). When a true multi-modal Gemma 4 E2B
+                    // bundle is added later, switch this on per-entry.
                     val candidate = Engine(
                         EngineConfig(
                             modelPath = absolutePath,
                             backend = backend,
-                            visionBackend = backend,
-                            audioBackend = backend,
+                            visionBackend = null,
+                            audioBackend = null,
                             maxNumTokens = null,
                             maxNumImages = null,
                             cacheDir = cacheDir.absolutePath,

From 6eb9496265e3f624f02b232e0198d3ea6f0a2b69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:10:17 +0200
Subject: [PATCH 05/20] Reject LEAP degenerate description / title output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Observed in the wild: for a page where LFM2-Extract had little to extract
from, the bridge returned

  {"description":":\",\",\",\",\",\",\",\",\",\",\",\",..."}

JSON shape is valid (the grammar constraint did its job) but the value
is degenerate punctuation repeated until the maxLength budget runs out —
the sampler picks whatever satisfies the grammar when the model has
nothing meaningful to say.

Two defences:

1. `looksDegenerate` heuristic — checks letter ratio, distinct-char
   count, and minimum length. Rejected output throws and the UI
   surfaces "AI description generation failed" instead of persisting
   `:","",...` as the bookmark's description. Applied to both
   description and title (tags pass through length / charset filters
   that would already catch this).

2. Strengthened the description prompt to:
   - explicitly require natural-language sentences and forbid
     punctuation-only output;
   - give an exact canonical fallback ("No summary available.") for
     when the supplied text doesn't have enough to extract, so the
     model has a way out instead of being cornered by the grammar.

Note that detection is the safety net; the prompt change is what we
hope reduces the failure rate. The pre-existing comment already warned
about this exact mode — it had regressed for thin pages.
---
 .../urlvault/android/ai/LeapModelProvider.kt  | 57 ++++++++++++++++---
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LeapModelProvider.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LeapModelProvider.kt
index 1627861..7520340 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LeapModelProvider.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LeapModelProvider.kt
@@ -135,9 +135,21 @@ class LeapModelProvider(
         // this as "extract a summary from the supplied text" rather than
         // "write a description"; otherwise the model has nothing to extract,
         // the grammar still forces a non-empty string, and we get garbage
-        // (the original prompt produced a single-comma description).
+        // (the original prompt produced a single-comma description; a later
+        // observed regression produced `{"description":":\",\",..."}` —
+        // valid JSON shape, garbage value, when supplied text was thin).
+        // Defences against that mode:
+        //   - state explicitly that real natural-language sentences are
+        //     required and that punctuation-only output is wrong;
+        //   - give the model a concrete fallback to emit when there's
+        //     nothing to extract, so it doesn't have to invent garbage to
+        //     satisfy the grammar.
+        // The provider also rejects degenerate output post-hoc — see
+        // `looksDegenerate`.
         val task = buildString {
-            appendLine("Extract a 1-2 sentence summary describing what the web page below is about. Use only information present in the supplied text.")
+            appendLine("Extract a 1-2 sentence summary describing what the web page below is about, using only information present in the supplied text.")
+            appendLine("The summary must be real English (or German) sentences with normal words and spaces — never punctuation-only output.")
+            appendLine("If the supplied text does not contain enough information to summarise, return exactly: No summary available.")
             appendLine()
             appendLine("URL: $url")
             if (title.isNotBlank()) appendLine("Title: $title")
@@ -145,9 +157,9 @@ class LeapModelProvider(
                 appendLine("Page content:")
                 appendLine(pageSummary)
             } else {
-                // No page content fetched — give the model something concrete
-                // to extract from rather than asking it to invent prose.
-                appendLine("Page content: (unavailable — derive a one-sentence summary from the URL and title only)")
+                // No page content fetched — explicitly authorise the
+                // canonical fallback rather than asking for invented prose.
+                appendLine("Page content: (unavailable — return: No summary available.)")
             }
             appendLine()
             appendLine("Return the extracted summary as the \"description\" field.")
@@ -159,7 +171,12 @@ class LeapModelProvider(
         }
         Log.i(TAG, "[$id] description raw: $raw")
 
-        validateDescription(parseJson<DescriptionExtraction>(raw).description.trim())
+        val text = parseJson<DescriptionExtraction>(raw).description.trim()
+        if (looksDegenerate(text)) {
+            Log.w(TAG, "[$id] description rejected as degenerate: ${text.take(80)}")
+            error("Model produced degenerate output (no extractable content)")
+        }
+        validateDescription(text)
     }
 
     override suspend fun generateTitle(url: String): Result<String> = runCatching {
@@ -203,7 +220,33 @@ class LeapModelProvider(
         }
         Log.i(TAG, "[$id] title raw: $raw")
 
-        parseJson<TitleExtraction>(raw).title.trim().removeSurrounding("\"")
+        val text = parseJson<TitleExtraction>(raw).title.trim().removeSurrounding("\"")
+        if (looksDegenerate(text)) {
+            Log.w(TAG, "[$id] title rejected as degenerate: ${text.take(80)}")
+            error("Model produced degenerate output (no extractable content)")
+        }
+        text
+    }
+
+    /**
+     * Heuristic to catch the LFM2-Extract failure mode where the grammar-
+     * constrained sampler forces a non-empty string but the supplied text
+     * has nothing to extract — the model fills the budget with degenerate
+     * sequences like `:","","",...`. JSON shape is valid; value is garbage.
+     *
+     * Real natural-language output is mostly letters with reasonable
+     * character diversity. Reject anything that fails either bar so the UI
+     * surfaces "AI generation failed" instead of persisting garbage.
+     */
+    private fun looksDegenerate(text: String): Boolean {
+        val trimmed = text.trim()
+        if (trimmed.length < 5) return true
+        val letterCount = trimmed.count { it.isLetter() }
+        val letterRatio = letterCount.toDouble() / trimmed.length
+        if (letterRatio < 0.4) return true
+        val distinctChars = trimmed.toSet().size
+        if (distinctChars < 5) return true
+        return false
     }
 
     /**

From c7de63b360d67717f7b27ed240c4712272fbd8fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:11:18 +0200
Subject: [PATCH 06/20] Fire AI tags even when description generation fails

The description-success LaunchedEffect previously chained tag
generation only on success. With degenerate-output rejection now
landing on the failure branch (and observed: same URL where LEAP's
description sampler went sideways produced perfectly clean tags),
falling back to "no tags either" leaves the user with nothing.

Tags are an independent extraction and don't need the description as
context. On description error, kick them off from URL + title alone.
---
 .../com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt   | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
index 16fe6d2..1666dc7 100644
--- a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
+++ b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
@@ -236,6 +236,16 @@ fun AddEditBookmarkScreen(
             is AIGenerationState.Error -> {
                 aiDescriptionError = aiDescriptionState.message
                 onAiDescriptionConsumed()
+                // Description failed — but tags are an independent extraction
+                // and often succeed on the same input (observed: LEAP returned
+                // degenerate punctuation as the description while producing
+                // clean tags for the same URL). Fire tags from URL + title
+                // alone instead of giving up entirely.
+                val currentTarget = normalizeUrlForAi(url)
+                if (aiCoreEnabled && onAiGenerateTags != null && currentTarget != null) {
+                    aiTagError = null
+                    onAiGenerateTags(currentTarget, title, "")
+                }
             }
             else -> {}
         }

From 1a5a309414ddaab5d1b97fa48f3110b8fe9d1596 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:15:15 +0200
Subject: [PATCH 07/20] Re-trigger share-intent AI when aiCoreEnabled flips on
 after a delay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a URL is shared, MainActivity composes with aiCoreEnabled briefly
false because `anyProviderReady` is computed via produceState on
Dispatchers.Default and hasn't finished by the time
LaunchedEffect(prefilledUrl) fires in AddEditBookmarkScreen. That sent
the first trigger down the legacy metadata-extraction branch, and the
single-key LaunchedEffect never re-ran when aiCoreEnabled later flipped
to true — leaving the user with regex-extracted tags and no AI output.

Two prior fixes contributed to widening the race window:
  - tightening produceState keys (removed downloadStates) so the read
    is deferred to the first true state change rather than every tick,
  - moving the readiness probe off the main thread.
Both correct individually; together they make the startup gap visible.

Add aiCoreEnabled to the LaunchedEffect keys and pass force=true so
the second invocation bypasses the aiTriggeredForUrl dedup. The legacy
result-handling LaunchedEffects already gate their applies on
`if (!aiCoreEnabled)`, so once AI takes over the racing legacy results
are discarded harmlessly.
---
 .../jaeckel/urlvault/ui/AddEditBookmarkScreen.kt   | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
index 1666dc7..2edb34b 100644
--- a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
+++ b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
@@ -274,12 +274,20 @@ fun AddEditBookmarkScreen(
         }
     }
 
-    // Auto-trigger once for prefilled URLs (share intent).
-    LaunchedEffect(prefilledUrl) {
+    // Auto-trigger for prefilled URLs (share intent). Keyed on
+    // `aiCoreEnabled` as well as `prefilledUrl` so a startup race —
+    // share intent fires before `anyProviderReady`'s async readiness
+    // probe has finished, so `aiCoreEnabled` is briefly false and the
+    // first trigger ends up on the legacy branch — gets corrected once
+    // AI flips on. `force = true` so the re-trigger is not deduped by
+    // `aiTriggeredForUrl`. The legacy LaunchedEffects' `if (!aiCoreEnabled)`
+    // guards already prevent stale legacy results from clobbering the AI
+    // values when this flip happens.
+    LaunchedEffect(prefilledUrl, aiCoreEnabled) {
         if (!isEditMode && prefilledUrl != null) {
             val targetUrl = normalizeUrlForAi(prefilledUrl)
             if (targetUrl != null) {
-                triggerAiForUrl(targetUrl)
+                triggerAiForUrl(targetUrl, force = true)
             }
         }
     }

From 6453641de7b6559c26a998816cb8abfa942eafe4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:22:56 +0200
Subject: [PATCH 08/20] Append debug provenance tag to AI-generated tag lists
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a synthetic last tag of the form `dbg:<provider-id>@<HH:mm:ss>`
to every successful generateTags call in DEBUG builds, so a glance at
the saved bookmark tells you which SDK / model variant produced those
tags and when — useful when comparing LEAP vs llama.cpp vs LiteRT vs
AICore live during the talk demo.

Examples in saved bookmarks:
  dbg:leap:lfm2-1.2b-extract@14:13:14
  dbg:litertlm:qwen3-0.6b@14:14:02
  dbg:mlkit:gemini-nano-active@14:14:48

Stripped at the router level in release builds so synced Bitwarden
entries don't carry `dbg:…` tags into production. Only affects the
bookmark form path (BookmarkViewModel → router → provider). The
ModelComparisonRunner calls providers directly and is unaffected —
its UI already labels each row by provider.
---
 .../urlvault/android/ai/LocalModelRouter.kt   | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
index f0e9d66..285a93d 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
@@ -3,7 +3,10 @@ package com.jaeckel.urlvault.android.ai
 import android.util.Log
 import com.jaeckel.urlvault.ai.LocalModelProvider
 import com.jaeckel.urlvault.ai.LocalModelRegistry
+import com.jaeckel.urlvault.android.BuildConfig
 import kotlinx.coroutines.channels.BufferOverflow
+import java.time.LocalTime
+import java.time.format.DateTimeFormatter
 import kotlinx.coroutines.flow.MutableSharedFlow
 import kotlinx.coroutines.flow.MutableStateFlow
 import kotlinx.coroutines.flow.SharedFlow
@@ -13,6 +16,7 @@ import kotlinx.coroutines.flow.asStateFlow
 import kotlinx.coroutines.flow.update
 
 private const val TAG = "LocalModelRouter"
+private val DEBUG_TIME_FORMATTER = DateTimeFormatter.ofPattern("HH:mm:ss")
 
 /**
  * Selects which `LocalModelProvider` runs the bookmark AI calls. The selection
@@ -186,7 +190,23 @@ class LocalModelRouter(
             return Result.failure(IllegalStateException("No ready local AI model"))
         }
         emitPicked("tags", provider, pick)
-        return runTimed("tags", provider, pick) { provider.generateTags(url, title, content) }
+        val result = runTimed("tags", provider, pick) {
+            provider.generateTags(url, title, content)
+        }
+        // DEBUG-only: append a synthetic tag identifying which provider+time
+        // generated the list, so a glance at the saved bookmark tells you
+        // which SDK/model produced these tags. Stripped in release builds so
+        // synced Bitwarden entries don't end up with `dbg:…` in production.
+        return if (BuildConfig.DEBUG) {
+            result.map { it + debugProvenanceTag(provider) }
+        } else {
+            result
+        }
+    }
+
+    private fun debugProvenanceTag(provider: LocalModelProvider): String {
+        val time = LocalTime.now().format(DEBUG_TIME_FORMATTER)
+        return "dbg:${provider.id}@$time"
     }
 
     suspend fun generateDescription(url: String, title: String): Result<String> {

From 8e2c6c0053315f54cc2463d67dafa17931f6614c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:26:50 +0200
Subject: [PATCH 09/20] Debug provenance tag now shows generation duration,
 short SDK name

Was: dbg:leap:lfm2-1.2b-extract@14:13:14   (wall-clock time of generation)
Now: dbg:leap@2.34s                         (how long generation took)

Short SDK names map ModelRuntime to:
  ML_KIT      -> aicore
  LLAMA_CPP   -> llama
  LEAP        -> leap
  MEDIAPIPE   -> liteRt

Duration formatting: ms under 1s, two-decimal seconds above. Hand-
rolled rather than String.format to avoid host-locale dependence.
---
 .../urlvault/android/ai/LocalModelRouter.kt   | 37 +++++++++++++------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
index 285a93d..e7cc5bb 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
@@ -3,10 +3,9 @@ package com.jaeckel.urlvault.android.ai
 import android.util.Log
 import com.jaeckel.urlvault.ai.LocalModelProvider
 import com.jaeckel.urlvault.ai.LocalModelRegistry
+import com.jaeckel.urlvault.ai.ModelRuntime
 import com.jaeckel.urlvault.android.BuildConfig
 import kotlinx.coroutines.channels.BufferOverflow
-import java.time.LocalTime
-import java.time.format.DateTimeFormatter
 import kotlinx.coroutines.flow.MutableSharedFlow
 import kotlinx.coroutines.flow.MutableStateFlow
 import kotlinx.coroutines.flow.SharedFlow
@@ -16,7 +15,6 @@ import kotlinx.coroutines.flow.asStateFlow
 import kotlinx.coroutines.flow.update
 
 private const val TAG = "LocalModelRouter"
-private val DEBUG_TIME_FORMATTER = DateTimeFormatter.ofPattern("HH:mm:ss")
 
 /**
  * Selects which `LocalModelProvider` runs the bookmark AI calls. The selection
@@ -190,23 +188,40 @@ class LocalModelRouter(
             return Result.failure(IllegalStateException("No ready local AI model"))
         }
         emitPicked("tags", provider, pick)
+        val t0 = System.nanoTime()
         val result = runTimed("tags", provider, pick) {
             provider.generateTags(url, title, content)
         }
-        // DEBUG-only: append a synthetic tag identifying which provider+time
-        // generated the list, so a glance at the saved bookmark tells you
-        // which SDK/model produced these tags. Stripped in release builds so
-        // synced Bitwarden entries don't end up with `dbg:…` in production.
+        val durationMs = (System.nanoTime() - t0) / 1_000_000
+        // DEBUG-only: append a synthetic tag identifying which SDK ran and how
+        // long it took, so a glance at the saved bookmark tells you both at
+        // once. Stripped in release builds so synced Bitwarden entries never
+        // carry `dbg:…` tags into production.
         return if (BuildConfig.DEBUG) {
-            result.map { it + debugProvenanceTag(provider) }
+            result.map { it + debugProvenanceTag(provider, durationMs) }
         } else {
             result
         }
     }
 
-    private fun debugProvenanceTag(provider: LocalModelProvider): String {
-        val time = LocalTime.now().format(DEBUG_TIME_FORMATTER)
-        return "dbg:${provider.id}@$time"
+    private fun debugProvenanceTag(provider: LocalModelProvider, durationMs: Long): String {
+        val sdk = when (provider.runtime) {
+            ModelRuntime.ML_KIT -> "aicore"
+            ModelRuntime.LLAMA_CPP -> "llama"
+            ModelRuntime.LEAP -> "leap"
+            ModelRuntime.MEDIAPIPE -> "liteRt"
+        }
+        // ms below 1s, two-decimal seconds above. Avoids `String.format`
+        // (host-locale-dependent) by doing the math directly.
+        val duration = if (durationMs < 1000) {
+            "${durationMs}ms"
+        } else {
+            val whole = durationMs / 1000
+            val hundredths = (durationMs % 1000) / 10
+            val padded = if (hundredths < 10) "0$hundredths" else "$hundredths"
+            "$whole.${padded}s"
+        }
+        return "dbg:$sdk@$duration"
     }
 
     suspend fun generateDescription(url: String, title: String): Result<String> {

From 527f0faf34c40cf803a6a5796853bde3ca6837d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:28:02 +0200
Subject: [PATCH 10/20] Drop dbg: prefix from debug provenance tag

---
 .../jaeckel/urlvault/android/ai/LocalModelRouter.kt    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
index e7cc5bb..21b9055 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
@@ -193,10 +193,10 @@ class LocalModelRouter(
             provider.generateTags(url, title, content)
         }
         val durationMs = (System.nanoTime() - t0) / 1_000_000
-        // DEBUG-only: append a synthetic tag identifying which SDK ran and how
-        // long it took, so a glance at the saved bookmark tells you both at
-        // once. Stripped in release builds so synced Bitwarden entries never
-        // carry `dbg:…` tags into production.
+        // DEBUG-only: append a synthetic tag identifying which SDK ran and
+        // how long it took (e.g. `leap@2.34s`), so a glance at the saved
+        // bookmark tells you both at once. Stripped in release builds so
+        // synced Bitwarden entries never carry the marker into production.
         return if (BuildConfig.DEBUG) {
             result.map { it + debugProvenanceTag(provider, durationMs) }
         } else {
@@ -221,7 +221,7 @@ class LocalModelRouter(
             val padded = if (hundredths < 10) "0$hundredths" else "$hundredths"
             "$whole.${padded}s"
         }
-        return "dbg:$sdk@$duration"
+        return "$sdk@$duration"
     }
 
     suspend fun generateDescription(url: String, title: String): Result<String> {

From 7e91f44c5d92f802e8a461139a43cf09a5df9eee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:30:24 +0200
Subject: [PATCH 11/20] Include model name in debug provenance tag

---
 .../jaeckel/urlvault/android/ai/LocalModelRouter.kt | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
index 21b9055..1079549 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
@@ -193,9 +193,10 @@ class LocalModelRouter(
             provider.generateTags(url, title, content)
         }
         val durationMs = (System.nanoTime() - t0) / 1_000_000
-        // DEBUG-only: append a synthetic tag identifying which SDK ran and
-        // how long it took (e.g. `leap@2.34s`), so a glance at the saved
-        // bookmark tells you both at once. Stripped in release builds so
+        // DEBUG-only: append a synthetic tag of the form
+        // `<sdk>:<model>:<duration>` (e.g. `leap:lfm2-1.2b-extract:2.34s`)
+        // so a glance at the saved bookmark tells you SDK, model variant,
+        // and how long generation took. Stripped in release builds so
         // synced Bitwarden entries never carry the marker into production.
         return if (BuildConfig.DEBUG) {
             result.map { it + debugProvenanceTag(provider, durationMs) }
@@ -211,6 +212,10 @@ class LocalModelRouter(
             ModelRuntime.LEAP -> "leap"
             ModelRuntime.MEDIAPIPE -> "liteRt"
         }
+        // provider.id is `<runtime-prefix>:<model-id>` (e.g.
+        // `leap:lfm2-1.2b-extract`); strip the prefix so we can substitute
+        // the shorter SDK name without duplicating the runtime label.
+        val model = provider.id.substringAfter(':', missingDelimiterValue = provider.id)
         // ms below 1s, two-decimal seconds above. Avoids `String.format`
         // (host-locale-dependent) by doing the math directly.
         val duration = if (durationMs < 1000) {
@@ -221,7 +226,7 @@ class LocalModelRouter(
             val padded = if (hundredths < 10) "0$hundredths" else "$hundredths"
             "$whole.${padded}s"
         }
-        return "$sdk@$duration"
+        return "$sdk:$model:$duration"
     }
 
     suspend fun generateDescription(url: String, title: String): Result<String> {

From bca82e8f88a524f1dbb6175015f4667090ae9aa8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:33:35 +0200
Subject: [PATCH 12/20] Show LiteRT-LM (not MEDIAPIPE) as runtime label in
 comparison rows

The ProviderResultCard rendered `result.runtime.name`, leaking the
enum constant. `ModelRuntime.MEDIAPIPE` is a historical leftover from
when the LiteRT-LM bundle was loaded via MediaPipe-LLM; the runtime
today is LiteRT-LM, so display it as such. Added a `runtimeLabel()`
helper that mirrors the same mapping already in SettingsScreen.
---
 .../jaeckel/urlvault/ui/ModelComparisonScreen.kt | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/ModelComparisonScreen.kt b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/ModelComparisonScreen.kt
index 20cac0d..9e41f14 100644
--- a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/ModelComparisonScreen.kt
+++ b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/ModelComparisonScreen.kt
@@ -35,6 +35,7 @@ import androidx.compose.ui.text.font.FontFamily
 import androidx.compose.ui.text.font.FontWeight
 import androidx.compose.ui.unit.dp
 import com.jaeckel.urlvault.ai.ModelComparisonRunner
+import com.jaeckel.urlvault.ai.ModelRuntime
 import kotlinx.coroutines.launch
 
 @OptIn(ExperimentalMaterial3Api::class)
@@ -212,7 +213,7 @@ private fun ProviderResultCard(result: ModelComparisonRunner.ProviderResult) {
                     fontWeight = FontWeight.SemiBold,
                 )
                 Text(
-                    text = result.runtime.name,
+                    text = runtimeLabel(result.runtime),
                     style = MaterialTheme.typography.labelSmall,
                     color = MaterialTheme.colorScheme.onSurfaceVariant,
                 )
@@ -270,3 +271,16 @@ private fun ResultLine(label: String, value: String, ms: Long) {
         )
     }
 }
+
+/**
+ * Human-friendly label for a runtime. The enum name `MEDIAPIPE` is a
+ * historical leftover from when the LiteRT-LM bundle was loaded via
+ * MediaPipe-LLM; the actual runtime today is LiteRT-LM, so render it that
+ * way in the UI rather than leaking the enum constant.
+ */
+private fun runtimeLabel(runtime: ModelRuntime): String = when (runtime) {
+    ModelRuntime.ML_KIT -> "AICore"
+    ModelRuntime.LLAMA_CPP -> "llama.cpp"
+    ModelRuntime.LEAP -> "Leap"
+    ModelRuntime.MEDIAPIPE -> "LiteRT-LM"
+}

From ddcd2e64b582cb6ab82cd1dca617fc0cec2f8a38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:35:35 +0200
Subject: [PATCH 13/20] Handle system back from Settings / AddEdit / Comparison
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without a BackHandler, the Activity finishes on system back and the
user is dropped out to the home screen — not what tapping the in-screen
back arrow does. Mirror the same per-screen routing as the arrow buttons:
Comparison -> Settings, Settings/AddEdit -> List. Disabled on List so
the OS default (close the app) still applies at the root.
---
 .../com/jaeckel/urlvault/android/MainActivity.kt | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
index bd811b8..09c9607 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/MainActivity.kt
@@ -3,6 +3,7 @@ package com.jaeckel.urlvault.android
 import android.content.Intent
 import android.os.Bundle
 import androidx.activity.ComponentActivity
+import androidx.activity.compose.BackHandler
 import androidx.activity.compose.setContent
 import androidx.activity.enableEdgeToEdge
 import androidx.compose.foundation.layout.Box
@@ -181,6 +182,21 @@ class MainActivity : ComponentActivity() {
                     }
                 }
 
+                // Without an explicit BackHandler, the system back gesture
+                // bypasses our in-memory `currentScreen` state and finishes
+                // the Activity — i.e. tapping back from Settings exits the
+                // app instead of returning to the bookmark list. Mirror the
+                // in-screen back arrows: Comparison → Settings; Settings and
+                // AddEdit → List. List is the root, so the handler is
+                // disabled there and the OS default (finish) applies.
+                BackHandler(enabled = currentScreen !is Screen.List) {
+                    currentScreen = when (currentScreen) {
+                        is Screen.Comparison -> Screen.Settings
+                        is Screen.Settings, is Screen.AddEdit -> Screen.List
+                        is Screen.List -> Screen.List // unreachable
+                    }
+                }
+
                 Column(
                     // enableEdgeToEdge() lets content draw under the system
                     // bars; the two *barsPadding modifiers reserve space at

From 8d774bd965df3a7a934f992cc15184e47383fd56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 14:39:33 +0200
Subject: [PATCH 14/20] Self-heal LiteRT-LM 'no OpenCL on this device' failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Symptom on Pixel 7a (Tensor G2): the GPU backend's `Engine.initialize()`
succeeds fine, then the first `Session.generateContent` call throws

  com.google.ai.edge.litertlm.LiteRtLmJniException:
    Failed to generate content: UNKNOWN: Can not find OpenCL library
    on this device

Cause: LiteRT-LM's Top-K sampler dlopens OpenCL even when the engine
runs through WebGPU, and Tensor doesn't ship OpenCL drivers. The
sampler-factory log shows it falling back to a statically-linked
OpenCL sampler, which then fails the same way. Generation is fatal;
load was happy.

Fix: catch the failure in runCollect, blocklist the broken backend
for the lifetime of the process, reload the same model on the next
candidate (NPU → GPU → CPU strategy means we land on CPU here), and
retry generation once. We hold the mutex throughout so no other
caller can race in mid-recovery.

Refactored:
- `load()` body extracted into `loadInternalLocked()` so recovery can
  re-enter it without dropping the mutex,
- `runCollect()` now wraps `runCollectOnce()` with the catch+retry,
- `runtimeBlockedBackends` set is filtered out at the start of
  `loadInternalLocked()` and surfaces a clear "all backends blocked"
  error if we exhaust them.

Detection is narrow today (`isRecoverableRuntimeError` only matches
'opencl'); widen as new device-specific failures show up.
---
 .../urlvault/android/ai/LiteRtLmSdkBridge.kt  | 183 ++++++++++++------
 1 file changed, 126 insertions(+), 57 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
index 2e2ad0b..f3ac8bd 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
@@ -70,6 +70,16 @@ class LiteRtLmSdkBridge(
     private var currentPath: String? = null
     private var currentBackend: String? = null
 
+    /**
+     * Backends that *initialised successfully* but then failed at runtime
+     * during `generateContent` (e.g. Pixel 7a's Tensor G2 GPU loads fine but
+     * the Top-K sampler tries to dlopen OpenCL and the Tensor stack has none,
+     * so `runCollect` throws `Can not find OpenCL library on this device`).
+     * Filtered out of subsequent loads in this process so the bridge doesn't
+     * keep redoing the same dance every call. Cleared on app process death.
+     */
+    private val runtimeBlockedBackends = mutableSetOf<String>()
+
     private val classLoaderProbe: Boolean by lazy {
         try {
             Class.forName("com.google.ai.edge.litertlm.Engine")
@@ -89,67 +99,85 @@ class LiteRtLmSdkBridge(
                 Log.v(TAG, "load: already loaded $absolutePath, no-op")
                 return
             }
-            withContext(Dispatchers.IO) {
-                engine?.let {
-                    Log.i(TAG, "load: switching model — closing previous $currentPath")
-                    runCatching { it.close() }
-                }
-                engine = null
-                currentPath = null
-                currentBackend = null
+            loadInternalLocked(absolutePath)
+        }
+    }
 
-                val cacheDir = File(context.cacheDir, "litertlm").also { it.mkdirs() }
-                val nativeLibDir = context.applicationInfo.nativeLibraryDir.orEmpty()
-                val backendsToTry = backendStrategy.candidates(nativeLibDir)
-
-                var lastError: Throwable? = null
-                for ((label, backend) in backendsToTry) {
-                    val t0 = System.currentTimeMillis()
-                    Log.i(TAG, "load: trying backend=$label for $absolutePath")
-                    // visionBackend / audioBackend left null: every entry in
-                    // ModelCatalog is text-only. Setting them to `backend`
-                    // tells the engine to enable those modalities, and
-                    // initialize() then fails with `NOT_FOUND:
-                    // TF_LITE_VISION_ENCODER not found in the model.` for
-                    // text-only bundles (FunctionGemma 270M, Gemma 3 270M,
-                    // Qwen3 0.6B, etc.). When a true multi-modal Gemma 4 E2B
-                    // bundle is added later, switch this on per-entry.
-                    val candidate = Engine(
-                        EngineConfig(
-                            modelPath = absolutePath,
-                            backend = backend,
-                            visionBackend = null,
-                            audioBackend = null,
-                            maxNumTokens = null,
-                            maxNumImages = null,
-                            cacheDir = cacheDir.absolutePath,
-                        ),
-                    )
-                    val initOk = runCatching { candidate.initialize() }
-                    if (initOk.isSuccess) {
-                        engine = candidate
-                        currentPath = absolutePath
-                        currentBackend = label
-                        Log.i(
-                            TAG,
-                            "load: ready on $label in ${System.currentTimeMillis() - t0}ms — $absolutePath",
-                        )
-                        return@withContext
-                    } else {
-                        lastError = initOk.exceptionOrNull()
-                        Log.w(
-                            TAG,
-                            "load: backend=$label failed (${lastError?.message}); trying next",
-                        )
-                        runCatching { candidate.close() }
-                    }
-                }
-                val tried = backendsToTry.joinToString(" → ") { it.first }
+    /**
+     * Same logic as [load] but assumes the caller already holds [mutex].
+     * Exists so [runCollect] can reload the engine on the next backend after
+     * an OpenCL-style runtime failure without dropping and re-acquiring the
+     * mutex (which would let another caller race in mid-recovery).
+     */
+    private suspend fun loadInternalLocked(absolutePath: String) {
+        withContext(Dispatchers.IO) {
+            engine?.let {
+                Log.i(TAG, "load: switching model — closing previous $currentPath")
+                runCatching { it.close() }
+            }
+            engine = null
+            currentPath = null
+            currentBackend = null
+
+            val cacheDir = File(context.cacheDir, "litertlm").also { it.mkdirs() }
+            val nativeLibDir = context.applicationInfo.nativeLibraryDir.orEmpty()
+            val backendsToTry = backendStrategy.candidates(nativeLibDir)
+                .filterNot { (label, _) -> label in runtimeBlockedBackends }
+
+            if (backendsToTry.isEmpty()) {
                 throw IllegalStateException(
-                    "LiteRT-LM failed on every backend ($tried). Last error: ${lastError?.message}",
-                    lastError,
+                    "LiteRT-LM has no usable backends left for this session " +
+                        "(all blocked by prior runtime failures: $runtimeBlockedBackends)",
+                )
+            }
+
+            var lastError: Throwable? = null
+            for ((label, backend) in backendsToTry) {
+                val t0 = System.currentTimeMillis()
+                Log.i(TAG, "load: trying backend=$label for $absolutePath")
+                // visionBackend / audioBackend left null: every entry in
+                // ModelCatalog is text-only. Setting them to `backend`
+                // tells the engine to enable those modalities, and
+                // initialize() then fails with `NOT_FOUND:
+                // TF_LITE_VISION_ENCODER not found in the model.` for
+                // text-only bundles (FunctionGemma 270M, Gemma 3 270M,
+                // Qwen3 0.6B, etc.). When a true multi-modal Gemma 4 E2B
+                // bundle is added later, switch this on per-entry.
+                val candidate = Engine(
+                    EngineConfig(
+                        modelPath = absolutePath,
+                        backend = backend,
+                        visionBackend = null,
+                        audioBackend = null,
+                        maxNumTokens = null,
+                        maxNumImages = null,
+                        cacheDir = cacheDir.absolutePath,
+                    ),
                 )
+                val initOk = runCatching { candidate.initialize() }
+                if (initOk.isSuccess) {
+                    engine = candidate
+                    currentPath = absolutePath
+                    currentBackend = label
+                    Log.i(
+                        TAG,
+                        "load: ready on $label in ${System.currentTimeMillis() - t0}ms — $absolutePath",
+                    )
+                    return@withContext
+                } else {
+                    lastError = initOk.exceptionOrNull()
+                    Log.w(
+                        TAG,
+                        "load: backend=$label failed (${lastError?.message}); trying next",
+                    )
+                    runCatching { candidate.close() }
+                }
             }
+            val tried = backendsToTry.joinToString(" → ") { it.first }
+            throw IllegalStateException(
+                "LiteRT-LM failed on every backend ($tried). Last error: ${lastError?.message}",
+                lastError,
+            )
         }
     }
 
@@ -176,6 +204,37 @@ class LiteRtLmSdkBridge(
     }
 
     private suspend fun runCollect(text: String, maxTokens: Int): String {
+        return try {
+            runCollectOnce(text, maxTokens)
+        } catch (t: Throwable) {
+            // Pixel 7a / Tensor G2: the GPU backend initialises fine but
+            // generation throws `Can not find OpenCL library on this device`
+            // because LiteRT-LM's Top-K sampler dlopens OpenCL even on the
+            // WebGPU path. Block this backend for the rest of the session
+            // and reload on the next one (typically CPU). We hold the
+            // mutex throughout, so no other call can race in.
+            val brokenBackend = currentBackend
+            val path = currentPath
+            if (brokenBackend != null && path != null && isRecoverableRuntimeError(t)) {
+                Log.w(
+                    TAG,
+                    "Recovering from $brokenBackend runtime failure (${t.message?.take(120)}) — " +
+                        "blocklisting and reloading on next backend",
+                )
+                runtimeBlockedBackends += brokenBackend
+                runCatching { engine?.close() }
+                engine = null
+                currentPath = null
+                currentBackend = null
+                loadInternalLocked(path)
+                runCollectOnce(text, maxTokens)
+            } else {
+                throw t
+            }
+        }
+    }
+
+    private suspend fun runCollectOnce(text: String, maxTokens: Int): String {
         val current = engine ?: error("LiteRT-LM: no model loaded")
         // maxNumTokens here is advisory — the SDK still respects the config-
         // level cap. We pass through whatever sampling the user requests.
@@ -204,6 +263,16 @@ class LiteRtLmSdkBridge(
         }
     }
 
+    /**
+     * Recoverable = the engine loaded but a runtime feature it tried to use
+     * isn't on this device. Right now the only known case is OpenCL missing
+     * on Pixel Tensor; widen as we hit more.
+     */
+    private fun isRecoverableRuntimeError(t: Throwable): Boolean {
+        val msg = (t.message ?: "").lowercase()
+        return "opencl" in msg || "open cl" in msg
+    }
+
     override suspend fun unload() {
         mutex.withLock {
             withContext(Dispatchers.IO) {

From 3f75958eb55b4efbb4684121476e433354d0c24b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 15:13:04 +0200
Subject: [PATCH 15/20] =?UTF-8?q?Don't=20reload+retry=20LiteRT-LM=20inline?=
 =?UTF-8?q?=20=E2=80=94=20OOMed=20Pixel=207a=20at=205.96=20GB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous OpenCL self-heal (b8cc107) closed the broken GPU engine and
reloaded on the next backend in the same mutex hold, then retried the
failed generate call. Engine.close() doesn't release the GPU pipeline's
native memory synchronously, so the in-flight reload of the CPU engine
held both pipelines in RAM briefly. Pixel 7a / Tensor G2 peaked at
5957197824 B (~5.96 GB) and the low-memory killer (LMK) reaped the app:

  am_pss : [12767, ..., 5957197824, ...]
  killinfo: [12767, ...]
  Process com.jaeckel.urlvault (pid 12767) has died: fg TOP

User saw it as "LiteRt now crashes the app" with no FATAL exception in
logcat — classic LMK signature.

New behaviour: catch the OpenCL-style runtime failure, blocklist the
broken backend for the session, release the engine, **fail this call**.
The very next entry-point call (provider.generateXxx → bridge.load)
sees engine == null and runs a fresh load through the strategy with
the blocklist applied — single-pipeline peak, no concurrent two-engine
window. UX is one failed generation followed by a working one.
---
 .../urlvault/android/ai/LiteRtLmSdkBridge.kt  | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
index f3ac8bd..4c17a85 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
@@ -210,27 +210,33 @@ class LiteRtLmSdkBridge(
             // Pixel 7a / Tensor G2: the GPU backend initialises fine but
             // generation throws `Can not find OpenCL library on this device`
             // because LiteRT-LM's Top-K sampler dlopens OpenCL even on the
-            // WebGPU path. Block this backend for the rest of the session
-            // and reload on the next one (typically CPU). We hold the
-            // mutex throughout, so no other call can race in.
+            // WebGPU path. Blocklist that backend so the *next* call reloads
+            // on the remaining strategy candidates (typically CPU).
+            //
+            // We deliberately do NOT reload + retry inline here. `Engine.close()`
+            // doesn't release the GPU pipeline's native memory synchronously
+            // — observed on Pixel 7a, the in-flight reload of the CPU engine
+            // briefly held both pipelines in RAM and the process peaked at
+            // ~5.96 GB, well past Pixel 7a's effective per-app budget. The
+            // LMK reaped the app and the user saw an unexplained "LiteRT
+            // crashed the app" with no FATAL exception in logcat. Bailing
+            // out here keeps peak memory at 1× model and lets the very next
+            // entry-point call (provider.generateXxx → bridge.load) start
+            // from a clean slate with the blocklist already applied.
             val brokenBackend = currentBackend
-            val path = currentPath
-            if (brokenBackend != null && path != null && isRecoverableRuntimeError(t)) {
+            if (brokenBackend != null && isRecoverableRuntimeError(t)) {
                 Log.w(
                     TAG,
                     "Recovering from $brokenBackend runtime failure (${t.message?.take(120)}) — " +
-                        "blocklisting and reloading on next backend",
+                        "blocklisting; next request will reload on remaining backends.",
                 )
                 runtimeBlockedBackends += brokenBackend
                 runCatching { engine?.close() }
                 engine = null
                 currentPath = null
                 currentBackend = null
-                loadInternalLocked(path)
-                runCollectOnce(text, maxTokens)
-            } else {
-                throw t
             }
+            throw t
         }
     }
 

From 3300669d511cf15a5b2a547c1d61aaa9943b8aab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 15:22:08 +0200
Subject: [PATCH 16/20] Prefer CPU over GPU in LiteRT-LM backend strategy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SDK auto-selects the Top-K sampler based on engine backend; there's
no public API knob to override it. With Backend.GPU, the SDK pulls the
WebGPU/OpenCL sampler chain, which on Pixel Tensor (7a / 8 / 9) all ends
at OpenCL — including the static fallback baked into liblitertlm_jni.so.
Generation throws `Can not find OpenCL library on this device` even when
the engine itself is happily running through WebGPU.

Deprioritising GPU below CPU in the strategy makes the *first* call on
Tensor land directly on `Backend.CPU`, which uses the CPU sampler and
works. The previous order (NPU → GPU → CPU) made the first call always
fail before recovery kicked in (and earlier in this branch, recovery
itself OOMed the process at peak memory).

Cost: on devices where GPU sampling works (older Snapdragon w/ proper
OpenCL drivers), we miss the GPU speedup. Acceptable — GPU is a
hypothetical performance win for the model graph; correctness on widely-
deployed Pixel hardware is more important. The runtime self-heal in
runCollect still catches the OpenCL error if a user forces GPU via a
custom strategy.
---
 .../urlvault/android/ai/LiteRtLmSdkBridge.kt  | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
index 4c17a85..f2d8f4d 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
@@ -30,9 +30,24 @@ fun interface LiteRtLmBackendStrategy {
 
 /**
  * NPU first when the device's `nativeLibraryDir` is non-blank (vendor libs
- * are loaded from there for QCS / Pixel chips), then GPU, then CPU. On
- * unsupported devices the NPU init throws and `load()` falls through to
- * the next backend.
+ * are loaded from there for QCS / Pixel chips), then **CPU**, then GPU.
+ *
+ * GPU is intentionally deprioritised below CPU. The LiteRT-LM SDK
+ * auto-selects the Top-K sampler based on engine backend: GPU engine
+ * pulls in the WebGPU / OpenCL sampler chain, which on Pixel Tensor (7a /
+ * 8 / 9) ends with `Can not find OpenCL library on this device` — the
+ * fallback is statically-linked OpenCL too. There is no public API knob
+ * to use the CPU sampler with a GPU engine. Using `Backend.CPU` for the
+ * engine forces the CPU sampler and side-steps the issue entirely.
+ *
+ * Cost: on devices where GPU sampling DOES work (older Snapdragon with
+ * full OpenCL drivers), we'd miss the GPU speedup. Acceptable, given
+ * "always works on CPU" beats "fast on some devices, broken on Tensor".
+ *
+ * The runtime self-heal in [LiteRtLmSdkBridge.runCollect] still catches
+ * the OpenCL error if it ever fires (e.g. someone explicitly forces a GPU
+ * strategy) and blocklists the broken backend so subsequent calls try
+ * the next candidate.
  */
 object DefaultBackendStrategy : LiteRtLmBackendStrategy {
     override fun candidates(nativeLibDir: String): List<Pair<String, Backend>> {
@@ -40,9 +55,9 @@ object DefaultBackendStrategy : LiteRtLmBackendStrategy {
         if (nativeLibDir.isNotBlank()) {
             list.add("NPU" to Backend.NPU(nativeLibDir))
         }
-        list.add("GPU" to Backend.GPU())
         // null = default thread count picked by the runtime.
         list.add("CPU" to Backend.CPU(null))
+        list.add("GPU" to Backend.GPU())
         return list
     }
 }

From 49a004e4e2772b94ecd8df22a500eea51631e87f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 17:23:52 +0200
Subject: [PATCH 17/20] =?UTF-8?q?LiteRT=20debug=20tag=20now=20shows=20back?=
 =?UTF-8?q?end=20label;=20restore=20NPU=E2=86=92GPU=E2=86=92CPU=20strategy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes after running on a Pixel 10 Pro Fold and seeing 70 s tag
generation, with no easy way to tell whether GPU acceleration was even
in play:

1. Restore the original NPU → GPU → CPU strategy. The earlier reorder
   (3300669) was an over-correction for the Pixel 7a OpenCL bug. With
   the OOM-free recovery now in place (3f75958, blocklist + throw, no
   inline retry) the trade-off goes the other way: G2 takes one failed
   first call before settling on CPU; G5 / 10 Pro Fold lands on GPU
   directly and gets the speedup. Always-CPU was strictly worse for
   the device class with working OpenCL/WebGPU.

2. Surface the actual loaded backend in the debug provenance tag for
   LiteRT-LM. The saved bookmark now carries
     liteRt[GPU]:gemma-3-1b-it-int4:2.34s
   instead of
     liteRt:gemma-3-1b-it-int4:70.20s
   so "is this NPU/GPU/CPU?" is answerable at a glance, no logcat
   needed. AICore / llama.cpp / Leap don't expose a comparable
   backend distinction in this app, so the suffix only fires for
   LiteRT.

Plumbing: LiteRtLmNativeBridge gains `currentBackendLabel()` (default
null), LiteRtLmSdkBridge returns its `currentBackend` field,
LiteRtLmModelProvider exposes a public method that calls into the
bridge, and the router casts to LiteRtLmModelProvider to read it when
building the tag.
---
 .../android/ai/LiteRtLmModelProvider.kt       |  9 ++++++
 .../android/ai/LiteRtLmNativeBridge.kt        |  8 +++++
 .../urlvault/android/ai/LiteRtLmSdkBridge.kt  | 32 ++++++++-----------
 .../urlvault/android/ai/LocalModelRouter.kt   | 11 ++++++-
 4 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmModelProvider.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmModelProvider.kt
index 77a1175..83933ae 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmModelProvider.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmModelProvider.kt
@@ -40,6 +40,15 @@ class LiteRtLmModelProvider(
 
     override suspend fun isReady(): Boolean = bridge.isAvailable()
 
+    /**
+     * Backend the SDK ended up loading on (`"NPU"` / `"GPU"` / `"CPU"`),
+     * or null if no model is loaded yet. Read by `LocalModelRouter` to
+     * enrich the debug provenance tag — the saved bookmark then carries
+     * `liteRt[GPU]:gemma-3-1b-it-int4:2.34s` so it's obvious at a glance
+     * whether NPU/GPU acceleration was actually in play.
+     */
+    fun currentBackendLabel(): String? = bridge.currentBackendLabel()
+
     override suspend fun preload() {
         // Same mutex as the generate path so an inference call can't race a
         // warm-up into the LiteRT-LM Engine constructor.
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmNativeBridge.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmNativeBridge.kt
index 8ee427e..2e4e8eb 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmNativeBridge.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmNativeBridge.kt
@@ -18,6 +18,14 @@ interface LiteRtLmNativeBridge {
     /** Whether LiteRT-LM loaded successfully and the device can run inference. */
     fun isAvailable(): Boolean
 
+    /**
+     * Label for the currently loaded backend (`"NPU"` / `"GPU"` / `"CPU"`),
+     * or null if no model is loaded. Surfaced in the debug provenance tag
+     * so the saved bookmark answers "did it run on NPU/GPU/CPU?" at a
+     * glance, without having to dig through logcat.
+     */
+    fun currentBackendLabel(): String? = null
+
     /**
      * Loads the `.litertlm` bundle at [absolutePath] into memory. Idempotent
      * per path: a repeated call with the same path is a no-op; a different
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
index f2d8f4d..e862747 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
@@ -30,24 +30,18 @@ fun interface LiteRtLmBackendStrategy {
 
 /**
  * NPU first when the device's `nativeLibraryDir` is non-blank (vendor libs
- * are loaded from there for QCS / Pixel chips), then **CPU**, then GPU.
+ * are loaded from there for QCS / Pixel chips), then GPU, then CPU.
  *
- * GPU is intentionally deprioritised below CPU. The LiteRT-LM SDK
- * auto-selects the Top-K sampler based on engine backend: GPU engine
- * pulls in the WebGPU / OpenCL sampler chain, which on Pixel Tensor (7a /
- * 8 / 9) ends with `Can not find OpenCL library on this device` — the
- * fallback is statically-linked OpenCL too. There is no public API knob
- * to use the CPU sampler with a GPU engine. Using `Backend.CPU` for the
- * engine forces the CPU sampler and side-steps the issue entirely.
- *
- * Cost: on devices where GPU sampling DOES work (older Snapdragon with
- * full OpenCL drivers), we'd miss the GPU speedup. Acceptable, given
- * "always works on CPU" beats "fast on some devices, broken on Tensor".
- *
- * The runtime self-heal in [LiteRtLmSdkBridge.runCollect] still catches
- * the OpenCL error if it ever fires (e.g. someone explicitly forces a GPU
- * strategy) and blocklists the broken backend so subsequent calls try
- * the next candidate.
+ * GPU sometimes works (newer Tensor / Snapdragon with proper OpenCL or
+ * WebGPU drivers — e.g. Pixel 10 Pro Fold) and sometimes doesn't (Pixel
+ * 7a / Tensor G2 throws `Can not find OpenCL library on this device` once
+ * generation starts, because the SDK auto-selects the GPU sampler from
+ * the engine backend and there is no public knob to override). The
+ * runtime self-heal in [LiteRtLmSdkBridge.runCollect] catches that
+ * failure, blocklists the broken backend for the rest of the session,
+ * and lets the *next* call reload on the remaining candidates (typically
+ * CPU). The cost is one failed first call on devices where GPU breaks;
+ * acceptable in exchange for keeping GPU acceleration where it works.
  */
 object DefaultBackendStrategy : LiteRtLmBackendStrategy {
     override fun candidates(nativeLibDir: String): List<Pair<String, Backend>> {
@@ -55,9 +49,9 @@ object DefaultBackendStrategy : LiteRtLmBackendStrategy {
         if (nativeLibDir.isNotBlank()) {
             list.add("NPU" to Backend.NPU(nativeLibDir))
         }
+        list.add("GPU" to Backend.GPU())
         // null = default thread count picked by the runtime.
         list.add("CPU" to Backend.CPU(null))
-        list.add("GPU" to Backend.GPU())
         return list
     }
 }
@@ -108,6 +102,8 @@ class LiteRtLmSdkBridge(
 
     override fun isAvailable(): Boolean = classLoaderProbe
 
+    override fun currentBackendLabel(): String? = currentBackend
+
     override suspend fun load(absolutePath: String) {
         mutex.withLock {
             if (currentPath == absolutePath && engine != null) {
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
index 1079549..ce2f491 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LocalModelRouter.kt
@@ -212,6 +212,15 @@ class LocalModelRouter(
             ModelRuntime.LEAP -> "leap"
             ModelRuntime.MEDIAPIPE -> "liteRt"
         }
+        // For LiteRT-LM, append the backend label the SDK actually picked
+        // (NPU/GPU/CPU) so the saved bookmark answers "did acceleration
+        // engage?" without having to grep logcat. The other runtimes don't
+        // expose a comparable concept (AICore is system-managed, llama.cpp
+        // and Leap are CPU-only here), so the suffix only fires for LiteRT.
+        val backendSuffix = (provider as? LiteRtLmModelProvider)
+            ?.currentBackendLabel()
+            ?.let { "[$it]" }
+            .orEmpty()
         // provider.id is `<runtime-prefix>:<model-id>` (e.g.
         // `leap:lfm2-1.2b-extract`); strip the prefix so we can substitute
         // the shorter SDK name without duplicating the runtime label.
@@ -226,7 +235,7 @@ class LocalModelRouter(
             val padded = if (hundredths < 10) "0$hundredths" else "$hundredths"
             "$whole.${padded}s"
         }
-        return "$sdk:$model:$duration"
+        return "$sdk$backendSuffix:$model:$duration"
     }
 
     suspend fun generateDescription(url: String, title: String): Result<String> {

From 64610a947ea0cda25619c4d387115e6f94fa77a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 17:47:31 +0200
Subject: [PATCH 18/20] Mode-aware dedup so AI doesn't fire twice when toggle
 flips on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Subsequent shares of the same URL produced two debug provenance tags
on the saved bookmark, with different durations — proof of two real
generation runs, not a duplicate state callback.

Cause: the share-intent LaunchedEffect was keyed on
`(prefilledUrl, aiCoreEnabled)` and called `triggerAiForUrl(force=true)`.
The `force=true` was needed for the startup race (legacy first, AI
second when the readiness probe settled) but it bypassed the URL
dedup *unconditionally* — any unrelated recomposition that flipped
`aiCoreEnabled` (or any other state Compose decided to invalidate the
effect on) re-fired the whole AI flow. Description ran twice → each
description-success chained tag generation → two `router.generateTags`
calls → two debug tags.

Fix: track the *mode* the URL was last triggered in (`"ai"` /
`"legacy"`) alongside the URL itself, and dedup on the pair. Same URL
+ same mode = no-op. Legacy → AI is a real mode change and falls
through (preserves the startup-race fix). The share-intent effect
drops `force = true`; the dedup now does the right thing on its own.

Other call sites that explicitly set `aiTriggeredForUrl = null` to
allow a re-trigger continue to work — with the stored URL null, the
`aiTriggeredForUrl == targetUrl` check is false, so the `&&` chain
short-circuits there and the mode comparison never gates them.
---
 .../urlvault/ui/AddEditBookmarkScreen.kt      | 35 +++++++++++++------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
index 2edb34b..bec8c1e 100644
--- a/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
+++ b/shared/src/commonMain/kotlin/com/jaeckel/urlvault/ui/AddEditBookmarkScreen.kt
@@ -96,8 +96,15 @@ fun AddEditBookmarkScreen(
 
     val TAG = "AddEditBookmarkScreen"
 
-    // Track which URL we've already triggered AI for, to prevent re-triggering
+    // Track which URL we've already triggered AI for, to prevent re-triggering.
+    // The mode component matters because the share-intent LaunchedEffect
+    // re-keys on `aiCoreEnabled`: when the AI master toggle flips on after
+    // a startup race, we *want* to re-trigger (legacy → AI), but only that
+    // once. Without the mode check, a `force = true` would re-fire even on
+    // unrelated recompositions, producing duplicate description / tags
+    // generations (and two debug provenance tags in the saved bookmark).
     var aiTriggeredForUrl by remember { mutableStateOf<String?>(null) }
+    var aiTriggeredMode by remember { mutableStateOf<String?>(null) }
 
     // Helper to normalize and validate URL for AI triggering
     fun normalizeUrlForAi(rawUrl: String): String? {
@@ -112,12 +119,18 @@ fun AddEditBookmarkScreen(
 
     // Helper to trigger AI/autotag for a given URL
     fun triggerAiForUrl(targetUrl: String, force: Boolean = false) {
-        Logger.d(TAG, "triggerAiForUrl($targetUrl, force=$force)")
-        if (!force && aiTriggeredForUrl == targetUrl) {
-            Logger.d(TAG, "Already triggered for $targetUrl")
+        val desiredMode = if (aiCoreEnabled) "ai" else "legacy"
+        Logger.d(TAG, "triggerAiForUrl($targetUrl, force=$force, mode=$desiredMode)")
+        // Dedup on (URL, mode). Same URL + same mode is a no-op so unrelated
+        // recompositions don't re-fire the AI flow. Same URL + different mode
+        // (legacy → AI when the master toggle flips on after the startup
+        // race) IS a legitimate retrigger and falls through.
+        if (!force && aiTriggeredForUrl == targetUrl && aiTriggeredMode == desiredMode) {
+            Logger.d(TAG, "Already triggered for $targetUrl in $desiredMode mode")
             return
         }
         aiTriggeredForUrl = targetUrl
+        aiTriggeredMode = desiredMode
 
         // If AI is available and enabled, use it for title/desc/tags
         if (aiCoreEnabled) {
@@ -275,19 +288,21 @@ fun AddEditBookmarkScreen(
     }
 
     // Auto-trigger for prefilled URLs (share intent). Keyed on
-    // `aiCoreEnabled` as well as `prefilledUrl` so a startup race —
+    // `aiCoreEnabled` as well as `prefilledUrl` so the startup race —
     // share intent fires before `anyProviderReady`'s async readiness
     // probe has finished, so `aiCoreEnabled` is briefly false and the
     // first trigger ends up on the legacy branch — gets corrected once
-    // AI flips on. `force = true` so the re-trigger is not deduped by
-    // `aiTriggeredForUrl`. The legacy LaunchedEffects' `if (!aiCoreEnabled)`
-    // guards already prevent stale legacy results from clobbering the AI
-    // values when this flip happens.
+    // AI flips on. `triggerAiForUrl`'s mode-aware dedup handles both
+    // cases cleanly: legacy → AI is a real mode change so it re-fires;
+    // a stable-true aiCoreEnabled across recompositions is the same
+    // mode and is deduped. The legacy result-handling LaunchedEffects'
+    // `if (!aiCoreEnabled)` guards already prevent stale legacy results
+    // from clobbering the AI values when this flip happens.
     LaunchedEffect(prefilledUrl, aiCoreEnabled) {
         if (!isEditMode && prefilledUrl != null) {
             val targetUrl = normalizeUrlForAi(prefilledUrl)
             if (targetUrl != null) {
-                triggerAiForUrl(targetUrl, force = true)
+                triggerAiForUrl(targetUrl)
             }
         }
     }

From 9e4422422a0900ec1d95812a6e6eee2ce0f82d4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 17:52:16 +0200
Subject: [PATCH 19/20] Re-deprioritise GPU below CPU; OpenCL bug confirmed on
 Tensor G5 too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Earlier (3300669 → 49a004e) I bounced this back and forth: deprioritised
GPU after the Pixel 7a / Tensor G2 OpenCL failure, then re-prioritised
it on the assumption that newer Tensor (G5 / Pixel 10 Pro Fold) might
have proper OpenCL drivers. Turns out it doesn't — the user reports the
same `Can not find OpenCL library on this device` on the Fold, so the
LiteRT-LM 0.10.x GPU sampler is broken on every Pixel Tensor we've
tested.

Final strategy (and the right one until/unless we get a working-OpenCL
device to test on): NPU → CPU → GPU. NPU rarely works (vendor libs
aren't packaged in the LiteRT-LM AAR for any Pixel we've seen) and
falls through to CPU, which has a CPU sampler and works correctly.
GPU stays in the list as a last resort for hypothetical future devices,
gated behind the runtime self-heal that blocklists it on the OpenCL
error if anyone forces GPU via a custom strategy.

Saves ~5–10 s of cold-start time on every first generation on the
user's two Pixel devices.
---
 .../urlvault/android/ai/LiteRtLmSdkBridge.kt  | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
index e862747..9235376 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmSdkBridge.kt
@@ -30,18 +30,23 @@ fun interface LiteRtLmBackendStrategy {
 
 /**
  * NPU first when the device's `nativeLibraryDir` is non-blank (vendor libs
- * are loaded from there for QCS / Pixel chips), then GPU, then CPU.
+ * are loaded from there for QCS / Pixel chips), then **CPU**, then GPU.
  *
- * GPU sometimes works (newer Tensor / Snapdragon with proper OpenCL or
- * WebGPU drivers — e.g. Pixel 10 Pro Fold) and sometimes doesn't (Pixel
- * 7a / Tensor G2 throws `Can not find OpenCL library on this device` once
- * generation starts, because the SDK auto-selects the GPU sampler from
- * the engine backend and there is no public knob to override). The
- * runtime self-heal in [LiteRtLmSdkBridge.runCollect] catches that
- * failure, blocklists the broken backend for the rest of the session,
- * and lets the *next* call reload on the remaining candidates (typically
- * CPU). The cost is one failed first call on devices where GPU breaks;
- * acceptable in exchange for keeping GPU acceleration where it works.
+ * GPU is intentionally last. On every Pixel Tensor we've tested (G2 on
+ * Pixel 7a, G5 on Pixel 10 Pro Fold) the GPU engine loads but the first
+ * generate call throws `Can not find OpenCL library on this device` —
+ * LiteRT-LM 0.10.x auto-selects an OpenCL Top-K sampler from the engine
+ * backend and Tensor doesn't ship OpenCL drivers. The SDK has no public
+ * knob to use the CPU sampler with a GPU engine, so on Tensor the only
+ * way to get a working sampler is to run the engine on CPU too. Putting
+ * CPU before GPU avoids a wasted ~5–10 s GPU load + failed generate
+ * cycle on every cold start on those devices.
+ *
+ * Cost: on a hypothetical device with working OpenCL drivers we'd miss
+ * the GPU speedup. We don't currently have such a test device and the
+ * "correct on Tensor" trade is much more important. The runtime
+ * self-heal in [LiteRtLmSdkBridge.runCollect] still catches the OpenCL
+ * error if a custom strategy puts GPU first.
  */
 object DefaultBackendStrategy : LiteRtLmBackendStrategy {
     override fun candidates(nativeLibDir: String): List<Pair<String, Backend>> {
@@ -49,9 +54,9 @@ object DefaultBackendStrategy : LiteRtLmBackendStrategy {
         if (nativeLibDir.isNotBlank()) {
             list.add("NPU" to Backend.NPU(nativeLibDir))
         }
-        list.add("GPU" to Backend.GPU())
         // null = default thread count picked by the runtime.
         list.add("CPU" to Backend.CPU(null))
+        list.add("GPU" to Backend.GPU())
         return list
     }
 }

From e58a524fae7c5ef9b3fd0c4a88e5b349180f5281 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20Ja=CC=88ckel?= <github@dirk.jaeckel.name>
Date: Wed, 29 Apr 2026 19:06:28 +0200
Subject: [PATCH 20/20] Skip LLM for description when the page already provides
 one
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every provider's generateDescription previously called into the model
unconditionally. But for any page with `og:description` or
`<meta name="description">` (i.e. most pages worth bookmarking), the
publisher already wrote a 1-2 sentence summary tuned for social-card /
SERP display — the LLM can't beat that, and asking it to "rewrite" the
existing string is wasted work that often degrades the result.

Mirror the same short-circuit `generateTitle` already uses for `<title>`
across all four providers (AICore, llama.cpp, LEAP, LiteRT-LM):

  if pageContent has og:description or meta description:
      return it verbatim (after URL stripping + length truncation)
  else:
      LLM as before, with `pageContent.visibleText` as the source

Particularly important for LEAP: LFM2-Extract is fine-tuned for
extraction, not generation. Asking it to summarise an already-good
summary often produced degenerate output (`{"description":":\",\",..."}`)
because the grammar's `minLength: 1` constraint forced the model to emit
*something* even when it had nothing to add. With this change,
LFM2-Extract only
fires for pages where extraction is genuinely needed (and from
`visibleText` — the actual long-form body — rather than a pre-written
summary).

The fall-through prompts now describe their input as "Page text:"
instead of "Page summary:" since visibleText is body text, not a
summary.

Side effects:
- ~70 s LLM calls on Tensor CPU collapse to <100 ms native lookups for
  the 80%+ of pages with metadata.
- The debug provenance tag (e.g. `leap:lfm2-1.2b-extract:N.NNs`) only
  fires on the LLM path now — its absence from a saved bookmark
  signals the description came from the page itself.
- The `looksDegenerate` LEAP rejection path stays in place for the
  visible-text-only case where the failure mode can still surface.
---
 .../urlvault/android/ai/AICoreService.kt      | 24 +++++++++++++------
 .../urlvault/android/ai/LeapModelProvider.kt  | 19 ++++++++++++++-
 .../android/ai/LiteRtLmModelProvider.kt       | 15 +++++++++++-
 .../android/ai/LlamaCppModelProvider.kt       | 15 ++++++++++--
 4 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/AICoreService.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/AICoreService.kt
index 8a974f5..0b4dc6f 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/AICoreService.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/AICoreService.kt
@@ -288,13 +288,26 @@ class AICoreService(httpClient: HttpClient) {
 
     /**
      * Generate a 1-2 sentence description for a bookmark.
-     * Fetches the web page to provide context for an accurate description.
+     *
+     * Same shape as [generateTitle]: if the page itself carries a
+     * publisher-written summary (`<meta property="og:description">` or
+     * `<meta name="description">`), return it verbatim — the LLM can't beat
+     * what the author wrote about their own page, and burning a Gemini Nano
+     * call to "rewrite" an existing 1-2 sentence summary is wasted work
+     * that often degrades the result. The LLM only fires for pages with no
+     * metadata-provided description, where genuine extraction from
+     * `visibleText` is needed.
      */
     suspend fun generateDescription(url: String, title: String): Result<String> {
         return runCatching {
             val pageContent = fetchPageContent(url)
-            val pageSummary = pageContent?.bestSummary(MAX_PAGE_CONTENT_LENGTH) ?: ""
 
+            val nativeDesc = pageContent?.let { it.ogDescription ?: it.metaDescription }
+            if (!nativeDesc.isNullOrBlank()) {
+                return@runCatching validateDescription(nativeDesc.trim())
+            }
+
+            val pageSummary = pageContent?.visibleText.orEmpty().take(MAX_PAGE_CONTENT_LENGTH)
             val prompt = buildString {
                 appendLine("Write a 1-2 sentence factual description for this bookmark.")
                 appendLine("Return ONLY the description, nothing else.")
@@ -305,15 +318,12 @@ class AICoreService(httpClient: HttpClient) {
                     appendLine("Title: $title")
                 }
                 if (pageSummary.isNotBlank()) {
-                    appendLine("Page summary: $pageSummary")
+                    appendLine("Page text: $pageSummary")
                 } else {
                     appendLine("If you cannot determine what the page is about, respond with: Unable to generate description.")
                 }
             }
-            
-            // See generateTags() — inline runBenchmarking removed for the
-            // same reason; explicit comparison lives in
-            // ModelComparisonScreen.
+
             validateDescription(runInference(prompt).trim())
         }
     }
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LeapModelProvider.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LeapModelProvider.kt
index 7520340..bda1bd9 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LeapModelProvider.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LeapModelProvider.kt
@@ -114,7 +114,24 @@ class LeapModelProvider(
 
     override suspend fun generateDescription(url: String, title: String): Result<String> = runCatching {
         val pageContent = runCatching { contentExtractor.extract(url) }.getOrNull()
-        val pageSummary = pageContent?.bestSummary(MAX_PAGE_CONTENT_LENGTH).orEmpty()
+
+        // Short-circuit on a page-provided description — same shape as
+        // generateTitle. Two reasons this matters specifically for LFM2-
+        // Extract:
+        //  - it's an *extraction* fine-tune, not a generation one. Asking
+        //    it to rewrite an already-good summary just wastes a model
+        //    call;
+        //  - on pages where the supplied text has nothing extractable, the
+        //    grammar's `minLength: 1` constraint produces degenerate
+        //    sequences like `:","",..."` (see `looksDegenerate`). Skipping
+        //    the LLM entirely when a usable description is already
+        //    available eliminates that failure mode for those pages.
+        val nativeDesc = pageContent?.let { it.ogDescription ?: it.metaDescription }
+        if (!nativeDesc.isNullOrBlank()) {
+            return@runCatching validateDescription(nativeDesc.trim())
+        }
+
+        val pageSummary = pageContent?.visibleText.orEmpty().take(MAX_PAGE_CONTENT_LENGTH)
 
         val schema = """
             {
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmModelProvider.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmModelProvider.kt
index 83933ae..f5fd9c0 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmModelProvider.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LiteRtLmModelProvider.kt
@@ -124,7 +124,20 @@ class LiteRtLmModelProvider(
 
     override suspend fun generateDescription(url: String, title: String): Result<String> = runCatching {
         val pageContent = runCatching { contentExtractor.extract(url) }.getOrNull()
-        val pageSummary = pageContent?.bestSummary(MAX_PAGE_CONTENT_LENGTH).orEmpty()
+
+        // Short-circuit on a page-provided description — same shape as
+        // generateTitle. Most pages carry a publisher-written
+        // og:description / <meta name="description"> already optimised for
+        // social-card / SERP display; the LLM rewrite is wasted work and
+        // on Tensor CPU here it costs ~1–5 seconds per call. Return the
+        // page's own description directly; the model fires only when the
+        // page has no metadata-provided description.
+        val nativeDesc = pageContent?.let { it.ogDescription ?: it.metaDescription }
+        if (!nativeDesc.isNullOrBlank()) {
+            return@runCatching validateDescription(nativeDesc.trim())
+        }
+
+        val pageSummary = pageContent?.visibleText.orEmpty().take(MAX_PAGE_CONTENT_LENGTH)
 
         val example = """{"description": "A Kotlin Multiplatform tutorial covering shared UI with Compose."}"""
 
diff --git a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LlamaCppModelProvider.kt b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LlamaCppModelProvider.kt
index 7d1d4af..32fbd62 100644
--- a/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LlamaCppModelProvider.kt
+++ b/androidApp/src/main/kotlin/com/jaeckel/urlvault/android/ai/LlamaCppModelProvider.kt
@@ -77,8 +77,19 @@ class LlamaCppModelProvider(
 
     override suspend fun generateDescription(url: String, title: String): Result<String> = runCatching {
         val pageContent = runCatching { contentExtractor.extract(url) }.getOrNull()
-        val pageSummary = pageContent?.bestSummary(MAX_PAGE_CONTENT_LENGTH).orEmpty()
 
+        // Same short-circuit as generateTitle: prefer the publisher's own
+        // description (og:description / <meta name="description">) over a
+        // model rewrite. The GGUF model is most useful when the page has
+        // *no* metadata-provided summary; otherwise we just spend several
+        // seconds rewriting a 1-2 sentence string into a slightly worse
+        // 1-2 sentence string.
+        val nativeDesc = pageContent?.let { it.ogDescription ?: it.metaDescription }
+        if (!nativeDesc.isNullOrBlank()) {
+            return@runCatching validateDescription(nativeDesc.trim())
+        }
+
+        val pageSummary = pageContent?.visibleText.orEmpty().take(MAX_PAGE_CONTENT_LENGTH)
         val prompt = buildString {
             appendLine("Write a 1-2 sentence factual description for this bookmark.")
             appendLine("Return ONLY the description, nothing else.")
@@ -87,7 +98,7 @@ class LlamaCppModelProvider(
             appendLine("URL: $url")
             if (title.isNotBlank()) appendLine("Title: $title")
             if (pageSummary.isNotBlank()) {
-                appendLine("Page summary: $pageSummary")
+                appendLine("Page text: $pageSummary")
             } else {
                 appendLine("If you cannot determine what the page is about, respond with: Unable to generate description.")
             }