From 3828dff78785fab207515a8f3e864d1c2df89712 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 01:19:00 +0000
Subject: [PATCH 01/12] Add cold-start model-loading dialog to chat

When the first token doesn't arrive within 5s of sending, surface a
small non-blocking dialog explaining the model is loading into memory,
with a Cancel button. The dialog tears down on the first frame (delta,
tool call, or terminal event) and isn't re-armed for warm later turns.
---
 src/ui/Chat.svelte | 132 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)
diff --git a/src/ui/Chat.svelte b/src/ui/Chat.svelte
index afd4856..b4dd2f1 100644
--- a/src/ui/Chat.svelte
+++ b/src/ui/Chat.svelte
@@ -120,6 +120,28 @@
   let input = $state("");
   let streaming = $state(false);
 
+  /** Cold-start UX: Ollama loads a model's weights into RAM/VRAM on
+   *  the first request after it was evicted (or never loaded this
+   *  session). That gap — between firing the chat stream and the
+   *  first token coming back — is silent today. If it runs long we
+   *  surface a small "loading the model" dialog so the user knows the
+   *  app isn't wedged. Once any frame arrives (delta, tool call, or
+   *  terminal event) the model is resident and we tear the dialog
+   *  down; we don't re-arm it for later turns in the same run since
+   *  the model is warm by then. */
+  const MODEL_LOAD_POPUP_DELAY_MS = 5000;
+  let modelLoading = $state(false);
+  let modelLoadTimer: ReturnType<typeof setTimeout> | null = null;
+  /** Clear the load-wait dialog + its arming timer. Idempotent, so
+   *  it's safe to call from every agent event and from cleanup. */
+  function clearModelLoadWait() {
+    if (modelLoadTimer !== null) {
+      clearTimeout(modelLoadTimer);
+      modelLoadTimer = null;
+    }
+    if (modelLoading) modelLoading = false;
+  }
+
   /** One pending attachment staged for the next send. Images become
    *  Ollama-style `images: [base64]` array entries on the user
    *  message; text-like files (JSON, configs, source, plain text)
@@ -838,6 +860,13 @@
       enabledToolSet.has(t.definition.function.name),
     );
 
+    // Arm the cold-start dialog: if no frame has come back by the
+    // delay, the model is (re)loading into memory and we tell the
+    // user so. clearModelLoadWait() below disarms it on first frame.
+    modelLoadTimer = setTimeout(() => {
+      modelLoading = true;
+    }, MODEL_LOAD_POPUP_DELAY_MS);
+
     try {
       await runAgent({
         messages: working,
@@ -849,6 +878,9 @@
         viaDevicePubkey: routeViaDevicePubkey,
         signal: controller.signal,
         onEvent: (event: AgentEvent) => {
+          // Any frame means the model is resident and producing —
+          // tear down the load-wait dialog (idempotent).
+          clearModelLoadWait();
           switch (event.kind) {
             case "assistant_delta":
             case "thinking_delta": {
@@ -916,6 +948,9 @@
       streaming = false;
       agentAbortController = null;
       inFlightToolCallIds = new Set();
+      // Belt-and-suspenders: if the run ended before any frame (error
+      // thrown, instant cancel), the timer/dialog could still be live.
+      clearModelLoadWait();
       // Drop the streaming flag on any straggler bubble so its
       // <details> can collapse cleanly once the answer is in.
       if (liveIdx !== -1 && liveIdx < messages.length) {
@@ -958,6 +993,9 @@
     // `infer_cancel` (mesh path) on whatever turn is in flight, then
     // unwinds the loop without starting another round.
     agentAbortController?.abort();
+    // Drop the load-wait dialog right away rather than waiting for the
+    // stream to unwind through the finally block.
+    clearModelLoadWait();
   }
 
   function onKeydown(e: KeyboardEvent) {
@@ -1392,6 +1430,28 @@
   {/if}
   </div>
 
+  {#if modelLoading}
+    <!-- Cold-start dialog. Shown only when the first token hasn't
+         arrived within MODEL_LOAD_POPUP_DELAY_MS — i.e. the model is
+         (re)loading into memory. Non-blocking: the user can keep
+         reading the transcript behind it, and Cancel aborts the run. -->
+    <div class="model-loading-backdrop" role="dialog" aria-modal="false" aria-live="polite">
+      <div class="model-loading-card">
+        <div class="spinner" aria-hidden="true"></div>
+        <div class="model-loading-text">
+          {#if routeViaDevicePubkey}
+            <p class="model-loading-title">Waiting for {routedPeer?.label ?? "the host"}…</p>
+            <p class="model-loading-sub">The host is loading its model. This can take a few seconds.</p>
+          {:else}
+            <p class="model-loading-title">Loading {activeModel}…</p>
+            <p class="model-loading-sub">First use reads the model into memory. This is usually a one-time wait — later replies start instantly.</p>
+          {/if}
+        </div>
+        <button class="model-loading-cancel" onclick={stop}>Cancel</button>
+      </div>
+    </div>
+  {/if}
+
   {#if settingsTab}
     <SettingsPanel
       initialTab={settingsTab}
@@ -1416,6 +1476,78 @@
     flex-direction: column;
     position: relative;
   }
+
+  /* Cold-start model-loading dialog. Floats over the chat surface
+     without blocking it (pointer-events scoped to the card). */
+  .model-loading-backdrop {
+    position: absolute;
+    inset: 0;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    background: rgba(0, 0, 0, 0.45);
+    z-index: 40;
+    pointer-events: none;
+    animation: model-loading-fade 0.18s ease-out;
+  }
+  @keyframes model-loading-fade {
+    from { opacity: 0; }
+    to { opacity: 1; }
+  }
+  .model-loading-card {
+    pointer-events: auto;
+    display: flex;
+    align-items: center;
+    gap: 0.9rem;
+    max-width: 24rem;
+    padding: 1.1rem 1.25rem;
+    background: #181818;
+    border: 1px solid #2a2a2a;
+    border-radius: 12px;
+    box-shadow: 0 12px 40px rgba(0, 0, 0, 0.5);
+  }
+  .model-loading-card .spinner {
+    flex: none;
+    width: 24px;
+    height: 24px;
+    border: 3px solid #333;
+    border-top-color: #6e6ef7;
+    border-radius: 50%;
+    animation: spin 0.8s linear infinite;
+  }
+  @keyframes spin {
+    to { transform: rotate(360deg); }
+  }
+  .model-loading-text {
+    min-width: 0;
+  }
+  .model-loading-title {
+    margin: 0;
+    color: #e8e8e8;
+    font-size: 0.95rem;
+    font-weight: 600;
+  }
+  .model-loading-sub {
+    margin: 0.25rem 0 0;
+    color: #999;
+    font-size: 0.8rem;
+    line-height: 1.35;
+  }
+  .model-loading-cancel {
+    flex: none;
+    align-self: flex-start;
+    background: none;
+    border: 1px solid #2a2a2a;
+    border-radius: 6px;
+    color: #bbb;
+    padding: 0.3rem 0.6rem;
+    font-size: 0.8rem;
+    cursor: pointer;
+  }
+  .model-loading-cancel:hover {
+    border-color: #3a3a55;
+    color: #ddd;
+  }
   .chat-body {
     flex: 1;
     min-height: 0;

From 9495d8eaa74935e656b4acce681f36cc69956a6e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 04:23:51 +0000
Subject: [PATCH 02/12] Add configurable keep_alive + live resource readout on
 load dialog
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two improvements to the model-loading experience:

1. Cold-start fix: chat requests now send Ollama a keep_alive so the
   model stays resident between turns instead of relying on Ollama's
   5-minute default (which caused repeated cold-start reloads). The
   value is user-configurable in Settings > Hardware > Performance,
   defaulting to 30m, with options from 'unload immediately' (for
   memory-tight machines coexisting with transcription) to 'keep
   until the app quits'. Read in Rust from config so both the
   streaming and one-shot chat paths pick it up.

2. The model-loading dialog now shows a live CPU / RAM / GPU readout
   (and disk free) so the user can see why a load is slow — e.g. RAM
   near full means the model is paging in from disk. Reuses the
   existing usage_live_snapshot command and the LiveSnapshot type,
   now promoted to types.ts and shared with the Usage settings tab.
---
 src-tauri/src/ollama.rs                |  23 +++
 src-tauri/src/resolver.rs              |   1 +
 src/config.ts                          |   1 +
 src/types.ts                           |  28 ++++
 src/ui/Chat.svelte                     | 194 +++++++++++++++++++++++--
 src/ui/settings/HardwareSection.svelte |  48 ++++++
 src/ui/settings/UsageSection.svelte    |  19 +--
 7 files changed, 280 insertions(+), 34 deletions(-)

diff --git a/src-tauri/src/ollama.rs b/src-tauri/src/ollama.rs
index 1157782..d3a62be 100644
--- a/src-tauri/src/ollama.rs
+++ b/src-tauri/src/ollama.rs
@@ -730,6 +730,27 @@ pub enum ChatStreamOutcome {
     Cancelled,
 }
 
+/// Resolve the user's configured Ollama `keep_alive` for chat
+/// requests: how long the model stays resident in RAM/VRAM after a
+/// turn. Longer values avoid cold-start reloads between messages;
+/// shorter values free memory sooner on memory-tight machines. The
+/// config stores a string: "30m", "1h", "0" (unload immediately),
+/// "-1" (keep loaded). Ollama only parses *strings* as unit-bearing
+/// durations, so a bare integer such as "-1" is sent as a JSON number
+/// (seconds) instead. Falls back to "30m" when the config is
+/// unreadable, the key is absent, or the value is not a string.
+fn chat_keep_alive() -> serde_json::Value {
+    let configured = crate::resolver::load_config_value()
+        .ok()
+        .and_then(|c| c.get("ollama_keep_alive")?.as_str().map(str::to_string))
+        .unwrap_or_else(|| "30m".to_string());
+    // Bare integers ("-1", "0") go out as numeric seconds; "30m" etc. pass through.
+    match configured.trim().parse::<i64>() {
+        Ok(seconds) => serde_json::json!(seconds),
+        Err(_) => serde_json::Value::from(configured),
+    }
+}
+
 /// Streamed chat completion. Invokes `on_content` for each visible token
 /// chunk, `on_thinking` for any reasoning/thinking deltas (thinking models
 /// emit those in `message.thinking`; non-thinking models never call it),
@@ -771,6 +792,7 @@ where
         "model": model,
         "messages": messages,
         "stream": true,
+        "keep_alive": chat_keep_alive(),
     });
     if let Some(t) = think {
         body["think"] = serde_json::json!(t);
@@ -933,6 +955,7 @@ pub async fn chat_once(
         "model": model,
         "messages": messages,
         "stream": false,
+        "keep_alive": chat_keep_alive(),
     });
     if let Some(opts) = options {
         body["options"] = opts;
diff --git a/src-tauri/src/resolver.rs b/src-tauri/src/resolver.rs
index 9a0e92e..ee89bec 100644
--- a/src-tauri/src/resolver.rs
+++ b/src-tauri/src/resolver.rs
@@ -783,6 +783,7 @@ pub fn default_config_value() -> Value {
         "active_family": "gemma4",
         "active_mode": "text",
         "model_cleanup_days": 1,
+        "ollama_keep_alive": "30m",
         "kept_models": [],
         "mode_overrides": {},
         "tracked_modes": ["text"],
diff --git a/src/config.ts b/src/config.ts
index b7bda88..5d2a3d0 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -111,6 +111,7 @@ const DEFAULT_CONFIG: Config = {
   // active_mode they persisted (mergeDefaults overlays raw on top).
   active_mode: "transcribe",
   model_cleanup_days: 1,
+  ollama_keep_alive: "30m",
   cleanup_warning_suppressed_families: [],
   kept_models: [],
   mode_overrides: {},
diff --git a/src/types.ts b/src/types.ts
index 77d2fe9..98241e5 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -11,6 +11,27 @@ export interface HardwareProfile {
   soc?: string | null;
 }
 
+/** Live resource snapshot — mirrors the `LiveSnapshot` struct in
+ *  src-tauri/src/usage.rs, returned by the `usage_live_snapshot`
+ *  Tauri command. Every counter is optional on the Rust side so the
+ *  UI renders "—" when a platform doesn't expose it. Shared between
+ *  the Usage settings tab and the chat model-loading dialog so both
+ *  read the same system lookups. */
+export interface LiveSnapshot {
+  cpu_app_pct: number | null;
+  cpu_total_pct: number | null;
+  ram_app_bytes: number | null;
+  ram_total_bytes: number | null;
+  ram_used_bytes: number | null;
+  gpu_pct: number | null;
+  vram_app_bytes: number | null;
+  vram_used_bytes: number | null;
+  vram_total_bytes: number | null;
+  process_uptime_seconds: number;
+  cpu_brand: string | null;
+  cpu_count: number | null;
+}
+
 export type Mode = "text" | "vision" | "code" | "transcribe" | "diarize";
 
 /** Runtimes the resolver knows how to dispatch to.
@@ -469,6 +490,13 @@ export interface Config {
   active_family: string;
   active_mode: Mode;
   model_cleanup_days: number;
+  /** Ollama `keep_alive` for chat requests — how long the model stays
+   *  resident in RAM/VRAM after a turn before Ollama unloads it.
+   *  Native Ollama duration format: "30m", "1h", "0" (unload right
+   *  away, frees memory for transcription), "-1" (keep until evicted).
+   *  Longer values avoid cold-start reloads between messages; shorter
+   *  values suit memory-tight machines. Default "30m". */
+  ollama_keep_alive: string;
   /** Family names for which the user has dismissed the
    *  "switching with auto-cleanup on" confirmation in the family
    *  detail view's per-tier picker. Per-family rather than per-tier
diff --git a/src/ui/Chat.svelte b/src/ui/Chat.svelte
index b4dd2f1..4b2995d 100644
--- a/src/ui/Chat.svelte
+++ b/src/ui/Chat.svelte
@@ -1,5 +1,6 @@
 <script lang="ts">
   import { invoke } from "@tauri-apps/api/core";
+  import { onDestroy } from "svelte";
   import TopBar from "./TopBar.svelte";
   import TextBar from "./TextBar.svelte";
   import SettingsPanel from "./SettingsPanel.svelte";
@@ -14,7 +15,7 @@
     type ToolCall,
   } from "../conversations";
   import type { SettingsTab } from "../update-state.svelte";
-  import type { HardwareProfile, Mode } from "../types";
+  import type { HardwareProfile, Mode, LiveSnapshot } from "../types";
   import {
     chatSlot,
     claimChat,
@@ -132,14 +133,68 @@
   const MODEL_LOAD_POPUP_DELAY_MS = 5000;
   let modelLoading = $state(false);
   let modelLoadTimer: ReturnType<typeof setTimeout> | null = null;
-  /** Clear the load-wait dialog + its arming timer. Idempotent, so
-   *  it's safe to call from every agent event and from cleanup. */
+
+  /** Live CPU/RAM/GPU snapshot shown inside the load-wait dialog so
+   *  the user can see *why* it's slow (e.g. RAM near full → the model
+   *  is paging in from disk). Reuses the same `usage_live_snapshot`
+   *  command the Usage settings tab uses (polled faster here), so
+   *  there's a single source of truth for system lookups. Polled only
+   *  while the dialog is up; null when idle. */
+  let liveStats = $state<LiveSnapshot | null>(null);
+  let statsPollHandle: ReturnType<typeof setInterval> | null = null;
+  /** Poll cadence for the load-wait readout. Faster than the Usage
+   *  tab's 2s so a short load still gets a real CPU% reading (the
+   *  first sample only primes the delta cache). */
+  const STATS_POLL_MS = 1200;
+
+  /** Fetch one snapshot for the readout; a failed fetch is non-fatal. */
+  async function refreshLiveStats() {
+    try {
+      const snapshot = await invoke<LiveSnapshot>("usage_live_snapshot");
+      // A reply can land after stopStatsPoll(); don't resurrect stale data.
+      if (statsPollHandle !== null) liveStats = snapshot;
+    } catch { /* dialog still shows spinner + copy without the readout */ }
+  }
+
+  function startStatsPoll() {
+    if (statsPollHandle !== null) return;
+    void refreshLiveStats(); // prime the CPU delta cache immediately
+    statsPollHandle = setInterval(() => void refreshLiveStats(), STATS_POLL_MS);
+  }
+
+  function stopStatsPoll() {
+    if (statsPollHandle !== null) {
+      clearInterval(statsPollHandle);
+      statsPollHandle = null;
+    }
+    liveStats = null;
+  }
+
+  /** Clear the load-wait dialog + its arming timer and stop the
+   *  resource poll. Idempotent, so it's safe to call from every agent
+   *  event and from cleanup. */
   function clearModelLoadWait() {
     if (modelLoadTimer !== null) {
       clearTimeout(modelLoadTimer);
       modelLoadTimer = null;
     }
     if (modelLoading) modelLoading = false;
+    stopStatsPoll();
+  }
+
+  // On teardown (mode switch, conversation close) disarm the pending
+  // load timer too — else it fires later and restarts the stats poll.
+  onDestroy(clearModelLoadWait);
+
+  /** Format a byte count as a compact GB string for the readout. */
+  function fmtGb(bytes: number | null | undefined): string {
+    if (bytes == null) return "—";
+    return `${(bytes / 1024 ** 3).toFixed(1)} GB`;
+  }
+  /** RAM-in-use percentage for the mini bar, or null when unknown. */
+  function ramUsedPct(): number | null {
+    if (!liveStats?.ram_used_bytes || !liveStats?.ram_total_bytes) return null;
+    return (liveStats.ram_used_bytes / liveStats.ram_total_bytes) * 100;
   }
 
   /** One pending attachment staged for the next send. Images become
@@ -865,6 +920,7 @@
     // user so. clearModelLoadWait() below disarms it on first frame.
     modelLoadTimer = setTimeout(() => {
       modelLoading = true;
+      startStatsPoll();
     }, MODEL_LOAD_POPUP_DELAY_MS);
 
     try {
@@ -1437,17 +1493,63 @@
          reading the transcript behind it, and Cancel aborts the run. -->
     <div class="model-loading-backdrop" role="dialog" aria-modal="false" aria-live="polite">
       <div class="model-loading-card">
-        <div class="spinner" aria-hidden="true"></div>
-        <div class="model-loading-text">
-          {#if routeViaDevicePubkey}
-            <p class="model-loading-title">Waiting for {routedPeer?.label ?? "the host"}…</p>
-            <p class="model-loading-sub">The host is loading its model. This can take a few seconds.</p>
-          {:else}
-            <p class="model-loading-title">Loading {activeModel}…</p>
-            <p class="model-loading-sub">First use reads the model into memory. This is usually a one-time wait — later replies start instantly.</p>
-          {/if}
+        <div class="model-loading-head">
+          <div class="spinner" aria-hidden="true"></div>
+          <div class="model-loading-text">
+            {#if routeViaDevicePubkey}
+              <p class="model-loading-title">Waiting for {routedPeer?.label ?? "the host"}…</p>
+              <p class="model-loading-sub">The host is loading its model. This can take a few seconds.</p>
+            {:else}
+              <p class="model-loading-title">Loading {activeModel}…</p>
+              <p class="model-loading-sub">First use reads the model into memory. This is usually a one-time wait — later replies start instantly.</p>
+            {/if}
+          </div>
+          <button class="model-loading-cancel" onclick={stop}>Cancel</button>
         </div>
-        <button class="model-loading-cancel" onclick={stop}>Cancel</button>
+
+        <!-- Live resource readout — only meaningful for local loads
+             (a remote model loads on the host's machine, not this
+             one). Bars + figures come from usage_live_snapshot, the
+             same lookup the Usage settings tab uses. -->
+        {#if !routeViaDevicePubkey}
+          <div class="model-loading-stats">
+            <div class="stat">
+              <div class="stat-row">
+                <span class="stat-label">CPU</span>
+                <span class="stat-val">{liveStats?.cpu_total_pct != null ? `${Math.round(liveStats.cpu_total_pct)}%` : "—"}</span>
+              </div>
+              <div class="meter"><div class="meter-fill" style="width: {Math.min(100, Math.max(0, liveStats?.cpu_total_pct ?? 0))}%"></div></div>
+            </div>
+            <div class="stat">
+              <div class="stat-row">
+                <span class="stat-label">RAM</span>
+                <span class="stat-val">
+                  {#if liveStats?.ram_used_bytes != null && liveStats?.ram_total_bytes != null}
+                    {fmtGb(liveStats.ram_used_bytes)} / {fmtGb(liveStats.ram_total_bytes)}
+                  {:else}—{/if}
+                </span>
+              </div>
+              <div class="meter"><div class="meter-fill" class:hot={(ramUsedPct() ?? 0) >= 90} style="width: {Math.min(100, Math.max(0, ramUsedPct() ?? 0))}%"></div></div>
+            </div>
+            {#if liveStats?.gpu_pct != null || liveStats?.vram_total_bytes != null}
+              <div class="stat">
+                <div class="stat-row">
+                  <span class="stat-label">GPU</span>
+                  <span class="stat-val">
+                    {liveStats?.gpu_pct != null ? `${Math.round(liveStats.gpu_pct)}%` : "—"}
+                    {#if liveStats?.vram_used_bytes != null && liveStats?.vram_total_bytes != null}
+                      <span class="stat-sub">· VRAM {fmtGb(liveStats.vram_used_bytes)} / {fmtGb(liveStats.vram_total_bytes)}</span>
+                    {/if}
+                  </span>
+                </div>
+                <div class="meter"><div class="meter-fill" style="width: {Math.min(100, Math.max(0, liveStats?.gpu_pct ?? 0))}%"></div></div>
+              </div>
+            {/if}
+            {#if hardware?.disk_free_gb != null}
+              <p class="stat-disk">Disk free: {hardware.disk_free_gb.toFixed(1)} GB</p>
+            {/if}
+          </div>
+        {/if}
       </div>
     </div>
   {/if}
@@ -1497,15 +1599,21 @@
   .model-loading-card {
     pointer-events: auto;
     display: flex;
-    align-items: center;
-    gap: 0.9rem;
-    max-width: 24rem;
+    flex-direction: column;
+    gap: 0.85rem;
+    width: 25rem;
+    max-width: calc(100% - 2rem);
     padding: 1.1rem 1.25rem;
     background: #181818;
     border: 1px solid #2a2a2a;
     border-radius: 12px;
     box-shadow: 0 12px 40px rgba(0, 0, 0, 0.5);
   }
+  .model-loading-head {
+    display: flex;
+    align-items: center;
+    gap: 0.9rem;
+  }
   .model-loading-card .spinner {
     flex: none;
     width: 24px;
@@ -1519,6 +1627,7 @@
     to { transform: rotate(360deg); }
   }
   .model-loading-text {
+    flex: 1;
     min-width: 0;
   }
   .model-loading-title {
@@ -1548,6 +1657,59 @@
     border-color: #3a3a55;
     color: #ddd;
   }
+  .model-loading-stats {
+    display: flex;
+    flex-direction: column;
+    gap: 0.55rem;
+    padding-top: 0.85rem;
+    border-top: 1px solid #242424;
+  }
+  .model-loading-stats .stat {
+    display: flex;
+    flex-direction: column;
+    gap: 0.25rem;
+  }
+  .model-loading-stats .stat-row {
+    display: flex;
+    justify-content: space-between;
+    align-items: baseline;
+    gap: 0.5rem;
+    font-size: 0.76rem;
+  }
+  .model-loading-stats .stat-label {
+    color: #888;
+    text-transform: uppercase;
+    letter-spacing: 0.04em;
+    font-size: 0.68rem;
+  }
+  .model-loading-stats .stat-val {
+    color: #ccc;
+    font-variant-numeric: tabular-nums;
+  }
+  .model-loading-stats .stat-sub {
+    color: #777;
+    font-size: 0.72rem;
+  }
+  .model-loading-stats .meter {
+    height: 5px;
+    background: #242424;
+    border-radius: 3px;
+    overflow: hidden;
+  }
+  .model-loading-stats .meter-fill {
+    height: 100%;
+    background: #6e6ef7;
+    border-radius: 3px;
+    transition: width 0.4s ease;
+  }
+  .model-loading-stats .meter-fill.hot {
+    background: #e35a5a;
+  }
+  .model-loading-stats .stat-disk {
+    margin: 0.1rem 0 0;
+    font-size: 0.72rem;
+    color: #777;
+  }
   .chat-body {
     flex: 1;
     min-height: 0;
diff --git a/src/ui/settings/HardwareSection.svelte b/src/ui/settings/HardwareSection.svelte
index 8cf37c5..9133c7e 100644
--- a/src/ui/settings/HardwareSection.svelte
+++ b/src/ui/settings/HardwareSection.svelte
@@ -15,6 +15,23 @@
   let conversationDir = $state("");
   let loading = $state(true);
   let error = $state("");
+  /** Ollama `keep_alive` for chat — how long the model stays resident
+   *  in memory after a turn. Longer avoids cold-start reloads between
+   *  messages; shorter frees RAM/VRAM for transcription on tight
+   *  machines. Stored in Ollama's native duration format. */
+  let keepAlive = $state("30m");
+  const KEEP_ALIVE_OPTIONS: { value: string; label: string }[] = [
+    { value: "0", label: "Unload immediately (lowest memory)" },
+    { value: "5m", label: "5 minutes (Ollama default)" },
+    { value: "30m", label: "30 minutes (recommended)" },
+    { value: "1h", label: "1 hour" },
+    { value: "-1", label: "Until the app quits (keep resident)" },
+  ];
+
+  async function patchKeepAlive(value: string) {
+    keepAlive = value;
+    await updateConfig({ ollama_keep_alive: value });
+  }
   /** Tag the resolver picks for transcribe against the active family +
    *  hardware. Resolved here (not just described) so users can confirm
    *  the active whisper model from this tab without bouncing to Models. */
@@ -58,6 +75,7 @@
       ]);
       hardware = hw;
       conversationDir = config.conversation_dir ?? "";
+      keepAlive = config.ollama_keep_alive ?? "30m";
       mic = { ...config.mic };
       micDevices = devices;
       if (manifest) {
@@ -249,6 +267,36 @@
         </dl>
       </div>
 
+      <div class="group-label">Performance</div>
+
+      <div class="card">
+        <div class="card-title">Model memory</div>
+        <p class="card-meta">
+          How long the chat model stays loaded in memory after a reply.
+          Longer keeps later messages instant; shorter frees RAM/VRAM
+          sooner — handy when transcription needs to run alongside on a
+          memory-tight machine.
+        </p>
+        <dl class="info">
+          <div class="full">
+            <dt>Keep model loaded for</dt>
+            <dd>
+              <select
+                value={keepAlive}
+                onchange={(e) => patchKeepAlive((e.currentTarget as HTMLSelectElement).value)}
+              >
+                {#if !KEEP_ALIVE_OPTIONS.some((o) => o.value === keepAlive)}
+                  <option value={keepAlive}>Custom: {keepAlive}</option>
+                {/if}
+                {#each KEEP_ALIVE_OPTIONS as opt (opt.value)}
+                  <option value={opt.value}>{opt.label}</option>
+                {/each}
+              </select>
+            </dd>
+          </div>
+        </dl>
+      </div>
+
       <div class="group-label">Storage</div>
 
       <div class="card">
diff --git a/src/ui/settings/UsageSection.svelte b/src/ui/settings/UsageSection.svelte
index 4912282..150fd21 100644
--- a/src/ui/settings/UsageSection.svelte
+++ b/src/ui/settings/UsageSection.svelte
@@ -2,24 +2,7 @@
   import { onMount, onDestroy } from "svelte";
   import { invoke } from "@tauri-apps/api/core";
   import { scrollAffordance } from "../scroll-affordance";
-
-  // Mirrors the LiveSnapshot struct in src-tauri/src/usage.rs. Every
-  // field is optional on the Rust side so we can render "—" when a
-  // platform doesn't expose the underlying counter.
-  interface LiveSnapshot {
-    cpu_app_pct: number | null;
-    cpu_total_pct: number | null;
-    ram_app_bytes: number | null;
-    ram_total_bytes: number | null;
-    ram_used_bytes: number | null;
-    gpu_pct: number | null;
-    vram_app_bytes: number | null;
-    vram_used_bytes: number | null;
-    vram_total_bytes: number | null;
-    process_uptime_seconds: number;
-    cpu_brand: string | null;
-    cpu_count: number | null;
-  }
+  import type { LiveSnapshot } from "../../types";
 
   // Mirrors UsageStats in src-tauri/src/usage.rs.
   interface UsageStats {

From d90e67415d33af8f8ef6d37ed5486f4af54c60d3 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 04:54:49 +0000
Subject: [PATCH 03/12] Throttle model loads and show the dialog before the
 freeze

Addresses laptops freezing so hard during a cold model load that the
load dialog never paints, and warms the model without locking up the
machine:

- Throttle the Ollama server we spawn: lower its IO priority (Linux
  ionice idle + small renice; macOS taskpolicy -b; Windows BelowNormal)
  so the disk thrash of paging weights in no longer starves the
  desktop. Best-effort, only when we own the process.
- Cap memory pressure via OLLAMA_MAX_LOADED_MODELS=1 and
  OLLAMA_NUM_PARALLEL=1 on spawn, cutting the swap thrash that causes
  the hardest freezes.
- Pre-paint the load dialog: before sending, check /api/ps to see if
  the model is resident; on a predicted cold start, show the dialog and
  force a paint BEFORE firing the request so it's on screen ahead of
  any freeze. Warm starts keep the lightweight 5s reactive timer.
- Warm the chat model in the background at startup (throttled) so the
  one-time cold load happens at a predictable moment instead of on the
  first message. Skipped when the model is missing or keep_alive is 0.
- warm() now honors the configured keep_alive instead of a fixed 10m.
---
 src-tauri/src/main.rs    | 21 +++++++++++++++++
 src-tauri/src/ollama.rs  | 45 ++++++++++++++++++++++++++++++++++--
 src-tauri/src/process.rs | 49 ++++++++++++++++++++++++++++++++++++++++
 src/ui/App.svelte        | 11 +++++++++
 src/ui/Chat.svelte       | 43 ++++++++++++++++++++++++++++++-----
 5 files changed, 161 insertions(+), 8 deletions(-)

diff --git a/src-tauri/src/main.rs b/src-tauri/src/main.rs
index 321d46e..55767d5 100644
--- a/src-tauri/src/main.rs
+++ b/src-tauri/src/main.rs
@@ -73,6 +73,25 @@ async fn ollama_list_models() -> Result<Vec<ollama::ModelInfo>, String> {
     ollama::list_models().await.map_err(|e| e.to_string())
 }
 
+/// True when `model` is already resident in Ollama's memory, so the
+/// next chat won't cold-load. The chat UI uses this to decide whether
+/// to paint the load dialog *before* firing the request (a cold load
+/// can thrash the machine hard enough that a delayed dialog never
+/// renders). Best-effort: returns false if Ollama can't be reached.
+#[tauri::command]
+async fn ollama_model_loaded(model: String) -> bool {
+    ollama::is_model_loaded(&model).await
+}
+
+/// Proactively load `model` into memory at the throttled server's low
+/// priority, so the one-time cold load happens at a predictable moment
+/// (e.g. just after launch) instead of freezing the user mid-chat.
+#[tauri::command]
+async fn ollama_warm(model: String) -> Result<(), String> {
+    ollama::ensure_running().await.map_err(|e| e.to_string())?;
+    ollama::warm(&model).await.map_err(|e| e.to_string())
+}
+
 #[tauri::command]
 async fn ollama_delete_model(name: String) -> Result<(), String> {
     ollama::delete_model(&name).await.map_err(|e| e.to_string())
@@ -1005,6 +1024,8 @@ fn main() {
             ollama_install,
             ollama_stop,
             ollama_list_models,
+            ollama_model_loaded,
+            ollama_warm,
             ollama_delete_model,
             preload_modes,
             ensure_tracked_models,
diff --git a/src-tauri/src/ollama.rs b/src-tauri/src/ollama.rs
index d3a62be..bcc8f3a 100644
--- a/src-tauri/src/ollama.rs
+++ b/src-tauri/src/ollama.rs
@@ -284,11 +284,25 @@ pub async fn ensure_running() -> Result<()> {
     let child = quiet_tokio_command("ollama")
         .arg("serve")
         .env("OLLAMA_ORIGINS", "*")
+        // Cap memory pressure on the laptop-class machines that freeze
+        // hard while a model pages in: keep at most one model resident,
+        // and serve one request at a time, so Ollama never tries to
+        // hold two models (or N parallel KV caches) in RAM/VRAM at once.
+        .env("OLLAMA_MAX_LOADED_MODELS", "1")
+        .env("OLLAMA_NUM_PARALLEL", "1")
         .stdout(Stdio::null())
         .stderr(Stdio::null())
         .spawn()
         .context("failed to spawn ollama serve")?;
 
+    // Throttle the server we just spawned so the disk thrash of loading
+    // a model doesn't lock up the whole desktop. Best-effort, and only
+    // possible because WE own this process — when Ollama is already
+    // running as a system/tray service we never reach this branch.
+    if let Some(pid) = child.id() {
+        crate::process::lower_priority(pid).await;
+    }
+
     *guard = Some(child);
 
     // Wait up to 10 seconds for API to become reachable.
@@ -595,14 +609,41 @@ pub async fn has_model(model: &str) -> Result<bool> {
     Ok(out.success())
 }
 
+/// True when `model` is currently loaded in Ollama's memory — i.e. the
+/// next chat won't pay a cold load. Queried via `/api/ps`. On any error
+/// (Ollama down, curl missing, unparseable body) we return `false` so
+/// callers fall back to showing the load dialog rather than wrongly
+/// skipping it. Ollama reports loaded models under either `name` or
+/// `model`, so we match on both.
+pub async fn is_model_loaded(model: &str) -> bool {
+    let Ok(body) = reqwest_get("http://127.0.0.1:11434/api/ps").await else {
+        return false;
+    };
+    let Ok(v) = serde_json::from_str::<serde_json::Value>(&body) else {
+        return false;
+    };
+    v.get("models")
+        .and_then(|m| m.as_array())
+        .map(|arr| {
+            arr.iter().any(|e| {
+                e.get("name").and_then(|n| n.as_str()) == Some(model)
+                    || e.get("model").and_then(|n| n.as_str()) == Some(model)
+            })
+        })
+        .unwrap_or(false)
+}
+
 /// Fire a 1-token chat call so Ollama mmaps the weights and keeps the model loaded
-/// for `keep_alive`. Used by `myownllm preload --warm`.
+/// for `keep_alive`. Used by `myownllm preload --warm` and the startup warm.
+/// Honors the user's configured `keep_alive` so a proactive warm respects
+/// the same residency window as real chats (warming with "0" would just
+/// load-then-unload, so callers skip warming in that case).
 pub async fn warm(model: &str) -> Result<()> {
     let body = serde_json::json!({
         "model": model,
         "messages": [{"role": "user", "content": "ok"}],
         "stream": false,
-        "keep_alive": "10m",
+        "keep_alive": chat_keep_alive(),
         "options": { "num_predict": 1 }
     })
     .to_string();
diff --git a/src-tauri/src/process.rs b/src-tauri/src/process.rs
index 9d1eeab..8e5dddb 100644
--- a/src-tauri/src/process.rs
+++ b/src-tauri/src/process.rs
@@ -46,3 +46,52 @@ fn apply_quiet_flags_tokio(cmd: &mut tokio::process::Command) {
 }
 #[cfg(not(target_os = "windows"))]
 fn apply_quiet_flags_tokio(_cmd: &mut tokio::process::Command) {}
+
+/// Best-effort: drop a child process's scheduling priority so heavy,
+/// bursty work — notably an LLM server paging multi-GB weights in from
+/// disk on first use — doesn't starve the desktop and freeze the whole
+/// machine. We lower **IO** priority where the platform exposes it,
+/// since model loading is disk-bound and that's the real lever, plus a
+/// gentle CPU nice. Every call is fire-and-forget: a missing tool or a
+/// permission error just means no throttle, never a hard failure.
+#[allow(unused_variables)] // `pid` is unused on platforms without a branch
+pub async fn lower_priority(pid: u32) {
+    let pid = pid.to_string();
+    #[cfg(target_os = "linux")]
+    {
+        // ionice class 3 = "idle": the process only gets disk time when
+        // nothing else wants it. This is what keeps the UI painting
+        // (and our load dialog visible) while the model streams in.
+        let _ = quiet_tokio_command("ionice")
+            .args(["-c", "3", "-p", &pid])
+            .status()
+            .await;
+        // A small CPU nudge — not a full demotion — so inference still
+        // feels snappy once the model is resident.
+        let _ = quiet_tokio_command("renice")
+            .args(["-n", "5", "-p", &pid])
+            .status()
+            .await;
+    }
+    #[cfg(target_os = "macos")]
+    {
+        // taskpolicy -b moves the process into the background QoS tier,
+        // throttling both CPU and disk IO — macOS's closest equivalent
+        // to Linux's ionice idle class.
+        let _ = quiet_tokio_command("taskpolicy")
+            .args(["-b", "-p", &pid])
+            .status()
+            .await;
+    }
+    #[cfg(target_os = "windows")]
+    {
+        // Windows doesn't expose per-process IO priority to other
+        // processes without FFI; dropping the priority class to
+        // BelowNormal still de-prioritizes the load against the UI.
+        let script = format!("(Get-Process -Id {pid}).PriorityClass='BelowNormal'");
+        let _ = quiet_tokio_command("powershell")
+            .args(["-NoProfile", "-NonInteractive", "-Command", &script])
+            .status()
+            .await;
+    }
+}
diff --git a/src/ui/App.svelte b/src/ui/App.svelte
index acf7ef7..17871c0 100644
--- a/src/ui/App.svelte
+++ b/src/ui/App.svelte
@@ -276,6 +276,17 @@
       // run the install lazily).
       view = "chat";
       invoke("ollama_ensure_running").catch(() => {});
+
+      // Proactively warm the chat model in the background so its
+      // one-time cold load happens now — with the throttled server
+      // keeping the machine responsive — rather than freezing the user
+      // on their first message. Skipped when the model isn't on disk
+      // yet (the download overlay owns that flow) or when keep_alive is
+      // "0" (warming would just load-then-unload). Fire-and-forget.
+      if (pendingTextModel && !textModelMissing && config.ollama_keep_alive !== "0") {
+        invoke("ollama_warm", { model: pendingTextModel }).catch(() => {});
+      }
+
       kickUpdateCheck();
 
       // Seed the sidebar early so it's ready when the chat view paints.
diff --git a/src/ui/Chat.svelte b/src/ui/Chat.svelte
index 4b2995d..e905a4f 100644
--- a/src/ui/Chat.svelte
+++ b/src/ui/Chat.svelte
@@ -1,6 +1,6 @@
 <script lang="ts">
   import { invoke } from "@tauri-apps/api/core";
-  import { onDestroy } from "svelte";
+  import { onDestroy, tick } from "svelte";
   import TopBar from "./TopBar.svelte";
   import TextBar from "./TextBar.svelte";
   import SettingsPanel from "./SettingsPanel.svelte";
@@ -186,6 +186,20 @@
   // down (mode switch, conversation close) mid-load.
   onDestroy(stopStatsPoll);
 
+  /** Resolve once the browser has actually painted: a Svelte tick to
+   *  flush the DOM update, then two animation frames so the compositor
+   *  draws the frame. We await this after showing the load dialog and
+   *  before kicking off a cold model load — a heavy load can thrash a
+   *  laptop badly enough that an un-painted dialog would never appear. */
+  function nextPaint(): Promise<void> {
+    return tick().then(
+      () =>
+        new Promise<void>((resolve) => {
+          requestAnimationFrame(() => requestAnimationFrame(() => resolve()));
+        }),
+    );
+  }
+
   /** Format a byte count as a compact GB string for the readout. */
   function fmtGb(bytes: number | null | undefined): string {
     if (bytes == null) return "—";
@@ -915,13 +929,30 @@
       enabledToolSet.has(t.definition.function.name),
     );
 
-    // Arm the cold-start dialog: if no frame has come back by the
-    // delay, the model is (re)loading into memory and we tell the
-    // user so. clearModelLoadWait() below disarms it on first frame.
-    modelLoadTimer = setTimeout(() => {
+    // Cold-start dialog. A model that isn't resident yet can thrash a
+    // laptop hard enough that a reactively-delayed dialog never gets to
+    // paint — so when we can confirm (via Ollama's /api/ps) that the
+    // model isn't loaded, we show the dialog and force a paint BEFORE
+    // firing the request. For the warm case, the remote path, or an
+    // unknown ps result, we keep the lightweight 5s reactive timer.
+    let coldStart = false;
+    if (!routeViaDevicePubkey) {
+      try {
+        coldStart = !(await invoke<boolean>("ollama_model_loaded", { model: activeModel }));
+      } catch {
+        // ps unavailable — fall back to the reactive timer below.
+      }
+    }
+    if (coldStart) {
       modelLoading = true;
       startStatsPoll();
-    }, MODEL_LOAD_POPUP_DELAY_MS);
+      await nextPaint(); // get the dialog on screen before the load freeze
+    } else {
+      modelLoadTimer = setTimeout(() => {
+        modelLoading = true;
+        startStatsPoll();
+      }, MODEL_LOAD_POPUP_DELAY_MS);
+    }
 
     try {
       await runAgent({

From 2dd5b8bfd4e8129580776e38f43c61730d55c9d7 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 05:04:54 +0000
Subject: [PATCH 04/12] Report system CPU% and RAM-used on macOS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The live usage sampler returned None for system-wide CPU% and RAM-used
on macOS (only the per-app figures populated), so the load dialog and
Usage tab showed app metrics but blank system metrics.

- System CPU%: sum every process's ps %cpu and normalise by core count
  (single fast call; no host_statistics FFI, no top -l 2 stall).
- System RAM used: (active + wired + compressed) pages x page size from
  vm_stat — the components Activity Monitor reports as Memory Used —
  using vm_stat's own header page size for self-consistent math.

Parsing is factored into pure helpers with unit tests so the logic is
verified on any host even though the macOS shell calls only run there.
---
 src-tauri/src/usage.rs | 158 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 152 insertions(+), 6 deletions(-)

diff --git a/src-tauri/src/usage.rs b/src-tauri/src/usage.rs
index 781ef9a..ef7df08 100644
--- a/src-tauri/src/usage.rs
+++ b/src-tauri/src/usage.rs
@@ -281,6 +281,67 @@ fn cpu_count() -> Option<u32> {
         .map(|n| n.get() as u32)
 }
 
+/// Sum a `ps -A -o %cpu=` dump (one float per process, each already a
+/// share of a single core) and normalise by `cpus` into a 0..100 share
+/// of total system CPU. `None` when nothing parses. Pure so it can be
+/// unit-tested on any host even though its only caller is macOS.
+#[cfg_attr(not(target_os = "macos"), allow(dead_code))]
+fn parse_total_cpu_pct(ps_output: &str, cpus: f64) -> Option<f64> {
+    let cpus = if cpus <= 0.0 { 1.0 } else { cpus };
+    let mut sum = 0.0;
+    let mut any = false;
+    for tok in ps_output.split_whitespace() {
+        if let Ok(v) = tok.parse::<f64>() {
+            sum += v;
+            any = true;
+        }
+    }
+    if !any {
+        return None;
+    }
+    Some((sum / cpus).clamp(0.0, 100.0))
+}
+
+/// Pull a page count out of a `vm_stat` line like
+/// `Pages active:    123456.` — digits only, trailing '.' dropped.
+#[cfg_attr(not(target_os = "macos"), allow(dead_code))]
+fn parse_vm_stat_pages(vm_stat: &str, key: &str) -> Option<u64> {
+    for line in vm_stat.lines() {
+        if let Some(rest) = line.trim_start().strip_prefix(key) {
+            let digits: String = rest.chars().filter(|c| c.is_ascii_digit()).collect();
+            if !digits.is_empty() {
+                return digits.parse::<u64>().ok();
+            }
+        }
+    }
+    None
+}
+
+/// Read the page size from a `vm_stat` header line, e.g.
+/// "Mach Virtual Memory Statistics: (page size of 16384 bytes)". Using
+/// the dump's own page size keeps the byte math consistent with its
+/// page counts (Apple Silicon is 16 KiB, Intel 4 KiB, and `hw.pagesize`
+/// doesn't always agree with the VM page size).
+#[cfg_attr(not(target_os = "macos"), allow(dead_code))]
+fn parse_vm_stat_page_size(vm_stat: &str) -> Option<u64> {
+    let line = vm_stat.lines().next()?;
+    let after = line.split("page size of").nth(1)?;
+    let tok = after.split_whitespace().next()?;
+    let digits: String = tok.chars().filter(|c| c.is_ascii_digit()).collect();
+    digits.parse::<u64>().ok().filter(|&n| n > 0)
+}
+
+/// macOS system "used" memory ≈ (active + wired + compressed) pages ×
+/// page size — the same components Activity Monitor reports as "Memory
+/// Used". `None` if any component line is absent.
+#[cfg_attr(not(target_os = "macos"), allow(dead_code))]
+fn parse_vm_stat_used_bytes(vm_stat: &str, page_bytes: u64) -> Option<u64> {
+    let active = parse_vm_stat_pages(vm_stat, "Pages active:")?;
+    let wired = parse_vm_stat_pages(vm_stat, "Pages wired down:")?;
+    let compressed = parse_vm_stat_pages(vm_stat, "Pages occupied by compressor:")?;
+    Some((active + wired + compressed).saturating_mul(page_bytes))
+}
+
 #[cfg(target_os = "linux")]
 fn cpu_brand() -> Option<String> {
     let content = std::fs::read_to_string("/proc/cpuinfo").ok()?;
@@ -450,9 +511,20 @@ fn sample_cpu() -> (Option<f64>, Option<f64>) {
         .and_then(|b| String::from_utf8(b).ok())
         .and_then(|s| s.trim().parse::<f64>().ok())
         .map(|v| (v / cpus).clamp(0.0, 100.0));
-    // Total system CPU% on macOS would need host_statistics — skip and
-    // leave as None. The UI handles the missing value cleanly.
-    (app_pct, None)
+    // System CPU%: sum every process's ps %cpu (each a share of one
+    // core) and normalise by core count. ps reports a decaying average
+    // rather than a true instant, but it's a single fast call — no
+    // host_statistics FFI and no `top -l 2` second-sample stall that
+    // would block the poll — and tracks "is the machine busy" well
+    // enough for the load readout.
+    let total_pct = quiet_command("ps")
+        .args(["-A", "-o", "%cpu="])
+        .output()
+        .ok()
+        .filter(|o| o.status.success())
+        .and_then(|o| String::from_utf8(o.stdout).ok())
+        .and_then(|s| parse_total_cpu_pct(&s, cpus));
+    (app_pct, total_pct)
 }
 
 #[cfg(target_os = "windows")]
@@ -635,9 +707,30 @@ fn sample_ram() -> (Option<u64>, Option<u64>, Option<u64>) {
         })
         .and_then(|b| String::from_utf8(b).ok())
         .and_then(|s| s.trim().parse::<u64>().ok());
-    // System "used" via `vm_stat` page-counting is fiddly — leave as None;
-    // the UI handles missing values.
-    (app, total, None)
+    // System "used" ≈ (active + wired + compressed) pages × page size,
+    // the components Activity Monitor sums as "Memory Used". Page size
+    // differs by arch (16 KiB on Apple Silicon, 4 KiB on Intel), so
+    // read it rather than assume.
+    let page = quiet_command("sysctl")
+        .args(["-n", "hw.pagesize"])
+        .output()
+        .ok()
+        .filter(|o| o.status.success())
+        .and_then(|o| String::from_utf8(o.stdout).ok())
+        .and_then(|s| s.trim().parse::<u64>().ok())
+        .unwrap_or(4096);
+    let used = quiet_command("vm_stat")
+        .output()
+        .ok()
+        .filter(|o| o.status.success())
+        .and_then(|o| String::from_utf8(o.stdout).ok())
+        .and_then(|s| {
+            // Prefer vm_stat's own header page size; fall back to the
+            // sysctl value, then a 4 KiB default.
+            let page = parse_vm_stat_page_size(&s).unwrap_or(page);
+            parse_vm_stat_used_bytes(&s, page)
+        });
+    (app, total, used)
 }
 
 #[cfg(target_os = "windows")]
@@ -829,3 +922,56 @@ fn nvidia_app_vram_bytes() -> Option<u64> {
     }
     None
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn total_cpu_pct_sums_and_normalises() {
+        // Four processes at 50% of one core each, on a 4-core box → 50%.
+        let out = "50.0\n50.0\n50.0\n50.0\n";
+        let pct = parse_total_cpu_pct(out, 4.0).unwrap();
+        assert!((pct - 50.0).abs() < 1e-6, "got {pct}");
+    }
+
+    #[test]
+    fn total_cpu_pct_handles_blanks_and_clamps() {
+        assert_eq!(parse_total_cpu_pct("", 4.0), None);
+        assert_eq!(parse_total_cpu_pct("   \n  \n", 4.0), None);
+        // Over-100% sum (transient ps quirk) clamps to 100.
+        assert_eq!(parse_total_cpu_pct("800.0\n", 4.0).unwrap(), 100.0);
+        // Zero/garbage cpu count is treated as 1, not a divide-by-zero.
+        assert_eq!(parse_total_cpu_pct("10.0\n", 0.0).unwrap(), 10.0);
+    }
+
+    #[test]
+    fn vm_stat_used_sums_active_wired_compressed() {
+        let vm = "Mach Virtual Memory Statistics: (page size of 16384 bytes)\n\
+Pages free:                          100000.\n\
+Pages active:                        200000.\n\
+Pages inactive:                      150000.\n\
+Pages speculative:                     5000.\n\
+Pages wired down:                    100000.\n\
+Pages occupied by compressor:         50000.\n";
+        // (200000 + 100000 + 50000) pages × 16384 bytes.
+        let used = parse_vm_stat_used_bytes(vm, 16384).unwrap();
+        assert_eq!(used, 350_000u64 * 16384);
+    }
+
+    #[test]
+    fn vm_stat_page_size_parsed_from_header() {
+        let vm = "Mach Virtual Memory Statistics: (page size of 16384 bytes)\n\
+Pages active: 1.\n";
+        assert_eq!(parse_vm_stat_page_size(vm), Some(16384));
+        assert_eq!(parse_vm_stat_page_size("no header here"), None);
+    }
+
+    #[test]
+    fn vm_stat_used_none_when_a_component_missing() {
+        // Has active + wired but no compressor line → None.
+        let vm = "Pages active: 10.\nPages wired down: 20.\n";
+        assert_eq!(parse_vm_stat_used_bytes(vm, 4096), None);
+        assert_eq!(parse_vm_stat_used_bytes("garbage", 4096), None);
+    }
+}

From 3f00490e84875905447246c6a995777ccd7ffa5b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 05:07:05 +0000
Subject: [PATCH 05/12] Clean up pre-existing compiler/clippy warnings

Mechanical, behavior-preserving cleanup via cargo fix / clippy --fix:
- Remove unused re-exports (mesh identity/roster/signing).
- Inline format args in format!/anyhow!/write! across asr, diarize,
  mesh, transcribe, cli, main.

cargo check is now warning-free. The remaining clippy-only lints
(result_large_err, doc-list indentation) need invasive manual changes
and are left for a focused follow-up.
---
 src-tauri/src/asr/moonshine.rs     | 4 ++--
 src-tauri/src/asr/parakeet.rs      | 2 +-
 src-tauri/src/cli.rs               | 7 ++-----
 src-tauri/src/diarize/embedder.rs  | 3 +--
 src-tauri/src/diarize/segmenter.rs | 3 +--
 src-tauri/src/main.rs              | 2 +-
 src-tauri/src/mesh/daemon.rs       | 6 +++---
 src-tauri/src/mesh/identity.rs     | 2 +-
 src-tauri/src/mesh/roster.rs       | 2 +-
 src-tauri/src/mesh/signing.rs      | 2 +-
 src-tauri/src/transcribe.rs        | 6 ++----
 11 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/src-tauri/src/asr/moonshine.rs b/src-tauri/src/asr/moonshine.rs
index df1e59b..201c159 100644
--- a/src-tauri/src/asr/moonshine.rs
+++ b/src-tauri/src/asr/moonshine.rs
@@ -673,7 +673,7 @@ impl MoonshineBackend {
             let arr = if use_cache {
                 kv.values[idx]
                     .as_ref()
-                    .ok_or_else(|| anyhow!("KV cache slot {} unpopulated under use_cache", idx))?
+                    .ok_or_else(|| anyhow!("KV cache slot {idx} unpopulated under use_cache"))?
                     .clone()
             } else {
                 let resolved_shape: Vec<usize> = past
@@ -736,7 +736,7 @@ impl MoonshineBackend {
             .map_err(|e| anyhow!("ort extract logits: {e}"))?;
         let shape = logits_view.shape().to_vec();
         if shape.len() != 3 || shape[0] != 1 {
-            return Err(anyhow!("unexpected decoder logits shape {:?}", shape));
+            return Err(anyhow!("unexpected decoder logits shape {shape:?}"));
         }
         let last = shape[1] - 1;
         let vocab = shape[2];
diff --git a/src-tauri/src/asr/parakeet.rs b/src-tauri/src/asr/parakeet.rs
index d810e05..db156f6 100644
--- a/src-tauri/src/asr/parakeet.rs
+++ b/src-tauri/src/asr/parakeet.rs
@@ -127,7 +127,7 @@ impl AsrBackend for ParakeetBackend {
                 .map_err(|e| anyhow!("ort threads: {e}"))?
                 .commit_from_file(&model_path_owned)
                 .map_err(|e| anyhow!("loading {}: {e}", model_path_owned.display()))
-                .with_context(|| format!("warm_up parakeet {}", model_name_owned))
+                .with_context(|| format!("warm_up parakeet {model_name_owned}"))
         })?;
 
         // Sniff I/O names. NeMo's istupakov export uses
diff --git a/src-tauri/src/cli.rs b/src-tauri/src/cli.rs
index fd69db0..a13bd85 100644
--- a/src-tauri/src/cli.rs
+++ b/src-tauri/src/cli.rs
@@ -1052,13 +1052,10 @@ async fn cmd_fetch_onnxruntime() -> Result<()> {
             let pct = (bytes as f64 / total as f64 * 100.0) as u64;
             let _ = write!(
                 std::io::stderr(),
-                "\r  {:>3}%  {:>10} / {:>10} bytes",
-                pct,
-                bytes,
-                total
+                "\r  {pct:>3}%  {bytes:>10} / {total:>10} bytes"
             );
         } else {
-            let _ = write!(std::io::stderr(), "\r  {:>10} bytes", bytes);
+            let _ = write!(std::io::stderr(), "\r  {bytes:>10} bytes");
         }
         let _ = std::io::stderr().flush();
     });
diff --git a/src-tauri/src/diarize/embedder.rs b/src-tauri/src/diarize/embedder.rs
index 88d7373..0479030 100644
--- a/src-tauri/src/diarize/embedder.rs
+++ b/src-tauri/src/diarize/embedder.rs
@@ -235,8 +235,7 @@ impl Embedder {
         let shape = view.shape().to_vec();
         if shape.last().copied().unwrap_or(0) == 0 {
             return Err(anyhow!(
-                "embedder produced zero-length output ({:?})",
-                shape
+                "embedder produced zero-length output ({shape:?})"
             ));
         }
         let mut out: Vec<f32> = view.iter().copied().collect();
diff --git a/src-tauri/src/diarize/segmenter.rs b/src-tauri/src/diarize/segmenter.rs
index 27a09f0..4c323da 100644
--- a/src-tauri/src/diarize/segmenter.rs
+++ b/src-tauri/src/diarize/segmenter.rs
@@ -227,8 +227,7 @@ impl Segmenter {
         let shape = logits.shape().to_vec();
         if shape.len() != 3 || shape[0] != 1 || shape[2] != 7 {
             return Err(anyhow!(
-                "unexpected segmenter output shape {:?} (want [1, T, 7])",
-                shape
+                "unexpected segmenter output shape {shape:?} (want [1, T, 7])"
             ));
         }
         let t_frames = shape[1];
diff --git a/src-tauri/src/main.rs b/src-tauri/src/main.rs
index 55767d5..35a04e8 100644
--- a/src-tauri/src/main.rs
+++ b/src-tauri/src/main.rs
@@ -651,7 +651,7 @@ fn mesh_file_save_at(path: String, bytes_b64: String) -> Result<(), String> {
     }
     let target = std::path::PathBuf::from(&path);
     if target.is_dir() {
-        return Err(format!("target {} is a directory", path));
+        return Err(format!("target {path} is a directory"));
     }
     // Best-effort: make sure the parent directory exists. The
     // save-dialog typically lands the user inside an existing folder,
diff --git a/src-tauri/src/mesh/daemon.rs b/src-tauri/src/mesh/daemon.rs
index 033c893..ab4d3b0 100644
--- a/src-tauri/src/mesh/daemon.rs
+++ b/src-tauri/src/mesh/daemon.rs
@@ -576,7 +576,7 @@ fn validate_path_is_executable(path: &Path) -> Result<(), String> {
         0xFEED_FACE | 0xFEED_FACF | 0xCAFE_BABE | 0xBEBA_FECA
     );
     if !(pe || elf || macho) {
-        return Err(format!("bad magic {:02x?}", head));
+        return Err(format!("bad magic {head:02x?}"));
     }
     if pe {
         f.seek(SeekFrom::Start(0x3C))
@@ -633,9 +633,9 @@ pub fn daemon_binary_candidates() -> Vec<PathBuf> {
     // sidecars next to the dev exe; `tauri build` strips it.
     // Checking both covers dev + production from one runtime path.
     let exe_with_triple = if cfg!(windows) {
-        format!("myownmesh-{}.exe", DAEMON_SIDECAR_TRIPLE)
+        format!("myownmesh-{DAEMON_SIDECAR_TRIPLE}.exe")
     } else {
-        format!("myownmesh-{}", DAEMON_SIDECAR_TRIPLE)
+        format!("myownmesh-{DAEMON_SIDECAR_TRIPLE}")
     };
     let mut out: Vec<PathBuf> = Vec::new();
 
diff --git a/src-tauri/src/mesh/identity.rs b/src-tauri/src/mesh/identity.rs
index 7dea8e7..31dc1e7 100644
--- a/src-tauri/src/mesh/identity.rs
+++ b/src-tauri/src/mesh/identity.rs
@@ -17,5 +17,5 @@
 //! `myownmesh-core` and aren't duplicated here.
 
 pub use myownmesh_core::identity::{
-    generate_network_id, load_or_create, normalize_network_id, set_label, Identity,
+    generate_network_id, load_or_create, normalize_network_id, set_label,
 };
diff --git a/src-tauri/src/mesh/roster.rs b/src-tauri/src/mesh/roster.rs
index 7f29ba8..efa84d4 100644
--- a/src-tauri/src/mesh/roster.rs
+++ b/src-tauri/src/mesh/roster.rs
@@ -23,7 +23,7 @@
 //! over.
 
 pub use myownmesh_core::roster::{
-    add_peer, add_peer_in, delete, empty_for, is_authorized, load, remove_peer, remove_peer_in,
+    add_peer, delete, load, remove_peer,
     save, AuthorizedPeer, Roster, ROSTER_VERSION,
 };
 
diff --git a/src-tauri/src/mesh/signing.rs b/src-tauri/src/mesh/signing.rs
index 36066ec..6aaa009 100644
--- a/src-tauri/src/mesh/signing.rs
+++ b/src-tauri/src/mesh/signing.rs
@@ -8,4 +8,4 @@
 //! which reads `MYOWNMESH_HOME` — set to `~/.myownllm` in `main.rs`
 //! — so the local Device ID is unchanged.
 
-pub use myownmesh_core::signing::{pubkey_part, sign, verify};
+pub use myownmesh_core::signing::{sign, verify};
diff --git a/src-tauri/src/transcribe.rs b/src-tauri/src/transcribe.rs
index 4dba775..93a8f65 100644
--- a/src-tauri/src/transcribe.rs
+++ b/src-tauri/src/transcribe.rs
@@ -1409,8 +1409,7 @@ fn run_session(
                         count_pending_chunks(&buffer_dir),
                         None,
                         Some(format!(
-                            "ASR inference error ({}/{}): {e:#}",
-                            consecutive_errors, ASR_CONSECUTIVE_ERROR_LIMIT
+                            "ASR inference error ({consecutive_errors}/{ASR_CONSECUTIVE_ERROR_LIMIT}): {e:#}"
                         )),
                     ),
                 );
@@ -2128,8 +2127,7 @@ fn ingest_loop(
                         count_pending_chunks(&buffer_dir),
                         None,
                         Some(format!(
-                            "Backlog full ({:.0} s); dropping oldest chunk to stay live.",
-                            MAX_BACKLOG_SECONDS
+                            "Backlog full ({MAX_BACKLOG_SECONDS:.0} s); dropping oldest chunk to stay live."
                         )),
                     ),
                 );

From 1f3e7470a87631ac9e403fc40c9247b7e61bb6b7 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 05:25:38 +0000
Subject: [PATCH 06/12] Soften throttle to IO-only, quiet daemon search, fix
 platform warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

macOS inference was crippled because the throttle used taskpolicy -b
(background QoS), which demotes the whole process to efficiency cores
and throttles compute, not just disk. Switch to IO-only throttling so
the machine stays responsive during a load while token generation runs
at full speed:
- macOS: taskpolicy -d throttle (disk IO policy only; CPU/QoS untouched).
- Linux: ionice best-effort low (-c2 -n7) instead of idle, and drop the
  renice so inference keeps full CPU.
- Windows: unchanged (BelowNormal is a mild priority nudge, not a
  compute throttle).

Daemon binary search no longer logs a 'skipping ...' line for every
probed-but-inapplicable location on the happy path. Reasons are now
collected and printed only when the search actually fails (no usable
binary, or every candidate fails to spawn).

Clean up the warnings surfaced by a Windows build (verified
via x86_64-pc-windows-gnu cross-check):
- usage.rs: drop unused std::ffi::c_void import.
- process.rs: drop redundant CommandExt import (tokio Command has an
  inherent creation_flags).
- ollama.rs: allow(unreachable_code) on install() — the tail Ok(()) is
  the Linux/unsupported fallback, unreachable on macOS/Windows by design.
- hardware.rs: cfg-gate the Linux-only parsers' dead_code allowance.
---
 src-tauri/src/hardware.rs    |  4 ++
 src-tauri/src/mesh/daemon.rs | 93 +++++++++++++++++++++++++-----------
 src-tauri/src/ollama.rs      |  4 ++
 src-tauri/src/process.rs     | 48 +++++++++----------
 src-tauri/src/usage.rs       |  1 -
 5 files changed, 98 insertions(+), 52 deletions(-)

diff --git a/src-tauri/src/hardware.rs b/src-tauri/src/hardware.rs
index 90ff2ad..e471262 100644
--- a/src-tauri/src/hardware.rs
+++ b/src-tauri/src/hardware.rs
@@ -177,6 +177,7 @@ fn read_proc_meminfo_total_gb() -> Option<f64> {
 }
 
 /// Pulled out for testability. Reads the `MemTotal: NNN kB` line.
+#[cfg_attr(not(target_os = "linux"), allow(dead_code))] // Linux detect() + tests only
 fn parse_meminfo_total_kb(content: &str) -> Option<u64> {
     for line in content.lines() {
         if let Some(rest) = line.strip_prefix("MemTotal:") {
@@ -262,6 +263,7 @@ fn df_k_gb(path: &str) -> Option<f64> {
 /// rows; field 4 is `Available` in 1K blocks. Some `df` flavours wrap long
 /// device names onto a second line, so the available column may be on the
 /// row after the device name. Find the first row with a numeric column 4.
+#[cfg_attr(not(target_os = "linux"), allow(dead_code))] // Linux detect() + tests only
 fn parse_df_avail_kb(out: &str) -> Option<u64> {
     for line in out.lines().skip(1) {
         let parts: Vec<&str> = line.split_whitespace().collect();
@@ -303,6 +305,7 @@ fn detect_soc_label() -> Option<String> {
     None
 }
 
+#[cfg_attr(not(target_os = "linux"), allow(dead_code))] // Linux detect() + tests only
 fn parse_device_tree_model(raw: &[u8]) -> Option<String> {
     // Trim trailing NUL bytes the kernel attaches to the device-tree string.
     let end = raw.iter().position(|b| *b == 0).unwrap_or(raw.len());
@@ -313,6 +316,7 @@ fn parse_device_tree_model(raw: &[u8]) -> Option<String> {
     Some(s.to_string())
 }
 
+#[cfg_attr(not(target_os = "linux"), allow(dead_code))] // Linux detect() + tests only
 fn parse_cpuinfo_model(content: &str) -> Option<String> {
     // ARM kernels emit `Model : Raspberry Pi 5 Model B Rev 1.0` and/or
     // `Hardware : BCM2712`. Prefer the human-friendly Model line.
diff --git a/src-tauri/src/mesh/daemon.rs b/src-tauri/src/mesh/daemon.rs
index ab4d3b0..53fc88a 100644
--- a/src-tauri/src/mesh/daemon.rs
+++ b/src-tauri/src/mesh/daemon.rs
@@ -532,25 +532,28 @@ impl Drop for DaemonChild {
 /// dev cycle rewrites with fresh content instead of perpetually
 /// staging corrupt bits. Files at paths we DON'T own (PATH
 /// lookup, env-var override) just get skipped without deletion.
-fn looks_like_executable(path: &Path) -> bool {
+/// Validate a candidate daemon binary. `Ok(())` when usable, else
+/// `Err(reason)` describing why it was rejected. Performs the self-heal
+/// removal of a stale *owned* slot, but does NOT log — callers collect
+/// the reasons and surface them only when the whole search fails, so a
+/// successful daemon launch stays quiet (see `ensure_daemon_running`).
+fn check_executable(path: &Path) -> Result<(), String> {
     match validate_path_is_executable(path) {
-        Ok(()) => true,
+        Ok(()) => Ok(()),
         Err(reason) => {
-            // Self-heal: if this is a known-owned slot and the
-            // content is invalid, delete it. Tauri's externalBin
-            // staging will regenerate from the source on the
-            // next build; the source's own validator catches
-            // problems before propagating them.
+            // Self-heal: if this is a known-owned slot and the content is
+            // invalid, delete it. Tauri's externalBin staging regenerates
+            // it from the source on the next build; the source's own
+            // validator catches problems before propagating them.
             if is_owned_slot(path) {
-                eprintln!(
-                    "daemon: {} failed executable check ({reason}); removing stale file",
-                    path.display()
-                );
                 let _ = std::fs::remove_file(path);
+                Err(format!(
+                    "{} failed executable check ({reason}); removed stale file",
+                    path.display()
+                ))
             } else {
-                eprintln!("daemon: skipping {} ({reason})", path.display());
+                Err(format!("skipping {} ({reason})", path.display()))
             }
-            false
         }
     }
 }
@@ -623,6 +626,14 @@ fn is_owned_slot(path: &Path) -> bool {
 }
 
 pub fn daemon_binary_candidates() -> Vec<PathBuf> {
+    daemon_binary_candidates_diag().0
+}
+
+/// Like [`daemon_binary_candidates`] but also returns the human-readable
+/// reason each rejected path was skipped. `ensure_daemon_running` holds
+/// these and only prints them if the whole search fails, so a successful
+/// launch doesn't spam the log with every probed-and-skipped location.
+fn daemon_binary_candidates_diag() -> (Vec<PathBuf>, Vec<String>) {
     let exe = if cfg!(windows) {
         "myownmesh.exe"
     } else {
@@ -638,18 +649,20 @@ pub fn daemon_binary_candidates() -> Vec<PathBuf> {
         format!("myownmesh-{DAEMON_SIDECAR_TRIPLE}")
     };
     let mut out: Vec<PathBuf> = Vec::new();
+    let mut diags: Vec<String> = Vec::new();
 
     // Helper: push a candidate iff it exists AND looks like a
     // real executable (filters out the zero-byte stub
     // `build.rs` writes when the daemon fetch was skipped, AND
     // filters out corrupt / truncated downloads that would
     // otherwise produce a confusing "%1 is not a valid Win32
-    // application" error when we try to spawn them).
-    fn push_if_usable(out: &mut Vec<PathBuf>, p: PathBuf) {
-        if !looks_like_executable(&p) {
-            return;
+    // application" error when we try to spawn them). Rejection
+    // reasons are collected into `diags` rather than logged here.
+    fn push_if_usable(out: &mut Vec<PathBuf>, diags: &mut Vec<String>, p: PathBuf) {
+        match check_executable(&p) {
+            Ok(()) => out.push(p),
+            Err(reason) => diags.push(reason),
         }
-        out.push(p);
     }
 
     // 1. Bundled sidecar next to the running LLM executable —
@@ -659,8 +672,8 @@ pub fn daemon_binary_candidates() -> Vec<PathBuf> {
     //    via the same code path.
     if let Ok(exe_path) = std::env::current_exe() {
         if let Some(exe_dir) = exe_path.parent() {
-            push_if_usable(&mut out, exe_dir.join(exe));
-            push_if_usable(&mut out, exe_dir.join(&exe_with_triple));
+            push_if_usable(&mut out, &mut diags, exe_dir.join(exe));
+            push_if_usable(&mut out, &mut diags, exe_dir.join(&exe_with_triple));
         }
     }
 
@@ -670,7 +683,11 @@ pub fn daemon_binary_candidates() -> Vec<PathBuf> {
     //     the *only* place the binary lives. Relative to the
     //     crate, so it works from any working directory.
     let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
-    push_if_usable(&mut out, manifest.join("binaries").join(&exe_with_triple));
+    push_if_usable(
+        &mut out,
+        &mut diags,
+        manifest.join("binaries").join(&exe_with_triple),
+    );
 
     // 2 + 3. Explicit env-var overrides.
     for var in ["MYOWNLLM_MESH_BIN", "MYOWNMESH_BIN"] {
@@ -696,12 +713,21 @@ pub fn daemon_binary_candidates() -> Vec<PathBuf> {
     //    `manifest` already declared above for the `binaries/`
     //    lookup; reuse it here.
     for profile in ["debug", "release"] {
-        push_if_usable(&mut out, manifest.join("target").join(profile).join(exe));
+        push_if_usable(
+            &mut out,
+            &mut diags,
+            manifest.join("target").join(profile).join(exe),
+        );
         if let Some(parent) = manifest.parent() {
-            push_if_usable(&mut out, parent.join("target").join(profile).join(exe));
+            push_if_usable(
+                &mut out,
+                &mut diags,
+                parent.join("target").join(profile).join(exe),
+            );
             if let Some(grandparent) = parent.parent() {
                 push_if_usable(
                     &mut out,
+                    &mut diags,
                     grandparent
                         .join("MyOwnMesh")
                         .join("target")
@@ -719,7 +745,7 @@ pub fn daemon_binary_candidates() -> Vec<PathBuf> {
         let canonical = std::fs::canonicalize(p).unwrap_or_else(|_| p.clone());
         seen.insert(canonical)
     });
-    out
+    (out, diags)
 }
 
 /// Legacy single-binary lookup. Returns the highest-priority
@@ -786,8 +812,15 @@ pub async fn ensure_daemon_running() -> Result<(ControlClient, Option<DaemonChil
     //    skipped in favour of a working binary at the next. The
     //    diag log surfaces each attempt so a user with multiple
     //    stale candidates can see which one we picked.
-    let candidates = daemon_binary_candidates();
+    // `skip_diags` records every probed-but-rejected location. We stay
+    // quiet about them on the happy path and only surface them if the
+    // search ultimately fails, so a normal launch doesn't log a wall of
+    // "skipping ..." lines for paths that simply don't apply here.
+    let (candidates, skip_diags) = daemon_binary_candidates_diag();
     if candidates.is_empty() {
+        for d in &skip_diags {
+            eprintln!("daemon: {d}");
+        }
         return Err(anyhow!(
             "couldn't find a `myownmesh` binary. Re-run the LLM build with network \
              access so `build.rs` can fetch the daemon, set MYOWNLLM_MESH_BIN to a \
@@ -870,8 +903,14 @@ pub async fn ensure_daemon_running() -> Result<(ControlClient, Option<DaemonChil
         drop(handle);
     }
 
-    // Every candidate failed. Surface a diag listing all of them
-    // so the user can see what got tried — beats "couldn't find".
+    // Every candidate failed. Now that the search has definitively
+    // failed, surface the locations we skipped during enumeration too —
+    // they're part of the picture the user needs to debug it.
+    for d in &skip_diags {
+        eprintln!("daemon: {d}");
+    }
+    // Surface a diag listing everything tried so the user can see what
+    // got attempted — beats a bare "couldn't find".
     let tried: Vec<String> = candidates.iter().map(|p| p.display().to_string()).collect();
     Err(anyhow!(
         "no working `myownmesh` binary on this machine. Tried:\n  {}\nLast error: {}",
diff --git a/src-tauri/src/ollama.rs b/src-tauri/src/ollama.rs
index bcc8f3a..329f2ca 100644
--- a/src-tauri/src/ollama.rs
+++ b/src-tauri/src/ollama.rs
@@ -96,6 +96,10 @@ fn ensure_macos_default_on_path() -> bool {
     false
 }
 
+// The trailing `Ok(())` is the Linux fall-through / unsupported-platform
+// fallback; on macOS and Windows the cfg blocks above always return, so
+// it's unreachable there by design — silence the per-platform lint.
+#[allow(unreachable_code)]
 pub async fn install() -> Result<()> {
     #[cfg(target_os = "linux")]
     {
diff --git a/src-tauri/src/process.rs b/src-tauri/src/process.rs
index 8e5dddb..c912a93 100644
--- a/src-tauri/src/process.rs
+++ b/src-tauri/src/process.rs
@@ -41,53 +41,53 @@ fn apply_quiet_flags(_cmd: &mut std::process::Command) {}
 
 #[cfg(target_os = "windows")]
 fn apply_quiet_flags_tokio(cmd: &mut tokio::process::Command) {
-    use std::os::windows::process::CommandExt;
+    // tokio's Command exposes `creation_flags` inherently — no CommandExt
+    // trait import needed (unlike the std::process variant above).
     cmd.creation_flags(CREATE_NO_WINDOW);
 }
 #[cfg(not(target_os = "windows"))]
 fn apply_quiet_flags_tokio(_cmd: &mut tokio::process::Command) {}
 
-/// Best-effort: drop a child process's scheduling priority so heavy,
-/// bursty work — notably an LLM server paging multi-GB weights in from
-/// disk on first use — doesn't starve the desktop and freeze the whole
-/// machine. We lower **IO** priority where the platform exposes it,
-/// since model loading is disk-bound and that's the real lever, plus a
-/// gentle CPU nice. Every call is fire-and-forget: a missing tool or a
-/// permission error just means no throttle, never a hard failure.
+/// Best-effort: ease a child process's **disk IO** priority so the heavy
+/// reads when an LLM server pages multi-GB weights in on first use don't
+/// starve the desktop — without throttling the CPU/GPU work, so token
+/// generation stays full speed once the model is resident. Model loading
+/// is disk-bound and inference is compute-bound, so targeting IO alone is
+/// the right lever: the machine stays responsive during the load but the
+/// model isn't kneecapped. Every call is fire-and-forget: a missing tool
+/// or permission error just means no throttle, never a hard failure.
 #[allow(unused_variables)] // `pid` is unused on platforms without a branch
 pub async fn lower_priority(pid: u32) {
     let pid = pid.to_string();
     #[cfg(target_os = "linux")]
     {
-        // ionice class 3 = "idle": the process only gets disk time when
-        // nothing else wants it. This is what keeps the UI painting
-        // (and our load dialog visible) while the model streams in.
+        // Best-effort IO class, lowest priority (7): the process still
+        // gets disk time but yields to everything else under contention.
+        // IO-only — we deliberately don't renice, so inference keeps full
+        // CPU once loaded. (Idle class 3 would make loads crawl under any
+        // disk activity; this is the gentler in-between.)
         let _ = quiet_tokio_command("ionice")
-            .args(["-c", "3", "-p", &pid])
-            .status()
-            .await;
-        // A small CPU nudge — not a full demotion — so inference still
-        // feels snappy once the model is resident.
-        let _ = quiet_tokio_command("renice")
-            .args(["-n", "5", "-p", &pid])
+            .args(["-c", "2", "-n", "7", "-p", &pid])
             .status()
             .await;
     }
     #[cfg(target_os = "macos")]
     {
-        // taskpolicy -b moves the process into the background QoS tier,
-        // throttling both CPU and disk IO — macOS's closest equivalent
-        // to Linux's ionice idle class.
+        // Set ONLY the disk IO policy to "throttle" (IOPOL_THROTTLE) —
+        // leaves CPU scheduling and QoS untouched so inference runs on
+        // the performance cores at full speed. The earlier `-b`
+        // (background QoS) demoted the whole process to efficiency cores
+        // and throttled compute, which crippled token generation.
         let _ = quiet_tokio_command("taskpolicy")
-            .args(["-b", "-p", &pid])
+            .args(["-d", "throttle", "-p", &pid])
             .status()
             .await;
     }
     #[cfg(target_os = "windows")]
     {
         // Windows doesn't expose per-process IO priority to other
-        // processes without FFI; dropping the priority class to
-        // BelowNormal still de-prioritizes the load against the UI.
+        // processes without FFI; BelowNormal is a mild priority-class
+        // nudge against the UI, not a compute throttle.
         let script = format!("(Get-Process -Id {pid}).PriorityClass='BelowNormal'");
         let _ = quiet_tokio_command("powershell")
             .args(["-NoProfile", "-NonInteractive", "-Command", &script])
diff --git a/src-tauri/src/usage.rs b/src-tauri/src/usage.rs
index ef7df08..ba93fa5 100644
--- a/src-tauri/src/usage.rs
+++ b/src-tauri/src/usage.rs
@@ -613,7 +613,6 @@ fn win_process_cpu_seconds() -> Option<f64> {
 #[cfg(target_os = "windows")]
 fn win_total_cpu_times() -> Option<(u64, u64)> {
     // (idle ticks, total ticks). Returns 100ns ticks summed across cores.
-    use std::ffi::c_void;
     type Bool = i32;
     #[repr(C)]
     struct Filetime {

From 3d0ecddea50d388c1bf62efb648eb4b3791ace9d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 05:32:22 +0000
Subject: [PATCH 07/12] Add Performance settings tab with a load-throttle
 setting

Promote performance settings out of the Hardware tab into their own
Performance tab (listed right after Hardware), and make the load
throttle user-tunable:

- New ollama_throttle config (off | io | aggressive), default io.
  - off: no throttle (fastest load, can bog the machine down).
  - io: ease disk IO priority only; inference stays full speed (default).
  - aggressive: also demote CPU/QoS; most responsive desktop, slower
    inference.
- lower_priority() now takes the mode and branches per platform; the
  Ollama spawn reads the config and skips throttling entirely when off.
- New PerformanceSection.svelte hosts both the keep-model-loaded
  (keep_alive) and load-throttle settings; removed the inline
  Performance group from HardwareSection.
---
 src-tauri/src/ollama.rs                   |  29 ++-
 src-tauri/src/process.rs                  |  89 ++++++----
 src-tauri/src/resolver.rs                 |   1 +
 src/config.ts                             |   1 +
 src/types.ts                              |   7 +
 src/ui/SettingsPanel.svelte               |   5 +
 src/ui/settings/HardwareSection.svelte    |  48 -----
 src/ui/settings/PerformanceSection.svelte | 207 ++++++++++++++++++++++
 8 files changed, 304 insertions(+), 83 deletions(-)
 create mode 100644 src/ui/settings/PerformanceSection.svelte

diff --git a/src-tauri/src/ollama.rs b/src-tauri/src/ollama.rs
index 329f2ca..17093b1 100644
--- a/src-tauri/src/ollama.rs
+++ b/src-tauri/src/ollama.rs
@@ -300,11 +300,15 @@ pub async fn ensure_running() -> Result<()> {
         .context("failed to spawn ollama serve")?;
 
     // Throttle the server we just spawned so the disk thrash of loading
-    // a model doesn't lock up the whole desktop. Best-effort, and only
-    // possible because WE own this process — when Ollama is already
+    // a model doesn't lock up the whole desktop. The mode is user-tunable
+    // (Settings → Performance); "off" skips it entirely. Best-effort, and
+    // only possible because WE own this process — when Ollama is already
     // running as a system/tray service we never reach this branch.
-    if let Some(pid) = child.id() {
-        crate::process::lower_priority(pid).await;
+    let mode = throttle_mode();
+    if mode != "off" {
+        if let Some(pid) = child.id() {
+            crate::process::lower_priority(pid, &mode).await;
+        }
     }
 
     *guard = Some(child);
@@ -796,6 +800,23 @@ fn chat_keep_alive() -> serde_json::Value {
         .unwrap_or_else(|| serde_json::json!("30m"))
 }
 
+/// Resolve the user's configured throttle mode for the Ollama server we
+/// spawn — how hard we ease its priority so model loading doesn't starve
+/// the desktop: "off" (no throttle), "io" (disk-IO only; keeps inference
+/// full speed — the default), or "aggressive" (also demote CPU/QoS; most
+/// responsive machine but slower inference). Falls back to "io".
+fn throttle_mode() -> String {
+    crate::resolver::load_config_value()
+        .ok()
+        .and_then(|c| {
+            c.get("ollama_throttle")
+                .and_then(|v| v.as_str())
+                .map(str::to_string)
+        })
+        .filter(|m| matches!(m.as_str(), "off" | "io" | "aggressive"))
+        .unwrap_or_else(|| "io".to_string())
+}
+
 /// Streamed chat completion. Invokes `on_content` for each visible token
 /// chunk, `on_thinking` for any reasoning/thinking deltas (thinking models
 /// emit those in `message.thinking`; non-thinking models never call it),
diff --git a/src-tauri/src/process.rs b/src-tauri/src/process.rs
index c912a93..e2db2ae 100644
--- a/src-tauri/src/process.rs
+++ b/src-tauri/src/process.rs
@@ -48,47 +48,74 @@ fn apply_quiet_flags_tokio(cmd: &mut tokio::process::Command) {
 #[cfg(not(target_os = "windows"))]
 fn apply_quiet_flags_tokio(_cmd: &mut tokio::process::Command) {}
 
-/// Best-effort: ease a child process's **disk IO** priority so the heavy
-/// reads when an LLM server pages multi-GB weights in on first use don't
-/// starve the desktop — without throttling the CPU/GPU work, so token
-/// generation stays full speed once the model is resident. Model loading
-/// is disk-bound and inference is compute-bound, so targeting IO alone is
-/// the right lever: the machine stays responsive during the load but the
-/// model isn't kneecapped. Every call is fire-and-forget: a missing tool
-/// or permission error just means no throttle, never a hard failure.
-#[allow(unused_variables)] // `pid` is unused on platforms without a branch
-pub async fn lower_priority(pid: u32) {
+/// Best-effort: ease a child process's priority so the heavy reads when
+/// an LLM server pages multi-GB weights in on first use don't starve the
+/// desktop. `mode` controls how hard:
+///
+/// - `"io"` (default): lower **disk IO** priority only. Model loading is
+///   disk-bound and inference is compute-bound, so this keeps the machine
+///   responsive during a load without kneecapping token generation.
+/// - `"aggressive"`: also demote CPU/QoS. Most responsive desktop during
+///   a load, but inference itself runs slower.
+///
+/// (`"off"` is handled by the caller, which simply doesn't call this.)
+/// Every call is fire-and-forget: a missing tool or permission error just
+/// means no throttle, never a hard failure.
+#[allow(unused_variables)] // `pid`/`mode` are unused on platforms without a branch
+pub async fn lower_priority(pid: u32, mode: &str) {
     let pid = pid.to_string();
+    let aggressive = mode == "aggressive";
     #[cfg(target_os = "linux")]
     {
-        // Best-effort IO class, lowest priority (7): the process still
-        // gets disk time but yields to everything else under contention.
-        // IO-only — we deliberately don't renice, so inference keeps full
-        // CPU once loaded. (Idle class 3 would make loads crawl under any
-        // disk activity; this is the gentler in-between.)
-        let _ = quiet_tokio_command("ionice")
-            .args(["-c", "2", "-n", "7", "-p", &pid])
-            .status()
-            .await;
+        if aggressive {
+            // Idle IO class (only runs when nothing else wants the disk)
+            // plus a CPU nice — maximum desktop responsiveness, slower
+            // load and inference.
+            let _ = quiet_tokio_command("ionice")
+                .args(["-c", "3", "-p", &pid])
+                .status()
+                .await;
+            let _ = quiet_tokio_command("renice")
+                .args(["-n", "5", "-p", &pid])
+                .status()
+                .await;
+        } else {
+            // Best-effort IO class, lowest priority (7): still gets disk
+            // time but yields under contention. IO-only — no renice, so
+            // inference keeps full CPU once loaded.
+            let _ = quiet_tokio_command("ionice")
+                .args(["-c", "2", "-n", "7", "-p", &pid])
+                .status()
+                .await;
+        }
     }
     #[cfg(target_os = "macos")]
     {
-        // Set ONLY the disk IO policy to "throttle" (IOPOL_THROTTLE) —
-        // leaves CPU scheduling and QoS untouched so inference runs on
-        // the performance cores at full speed. The earlier `-b`
-        // (background QoS) demoted the whole process to efficiency cores
-        // and throttled compute, which crippled token generation.
-        let _ = quiet_tokio_command("taskpolicy")
-            .args(["-d", "throttle", "-p", &pid])
-            .status()
-            .await;
+        if aggressive {
+            // Background QoS: demotes to efficiency cores and throttles
+            // both compute and IO. Frees the machine most, but slows
+            // inference noticeably.
+            let _ = quiet_tokio_command("taskpolicy")
+                .args(["-b", "-p", &pid])
+                .status()
+                .await;
+        } else {
+            // Disk IO policy "throttle" (IOPOL_THROTTLE) only — leaves CPU
+            // scheduling and QoS untouched, so inference runs on the
+            // performance cores at full speed.
+            let _ = quiet_tokio_command("taskpolicy")
+                .args(["-d", "throttle", "-p", &pid])
+                .status()
+                .await;
+        }
     }
     #[cfg(target_os = "windows")]
     {
         // Windows doesn't expose per-process IO priority to other
-        // processes without FFI; BelowNormal is a mild priority-class
-        // nudge against the UI, not a compute throttle.
-        let script = format!("(Get-Process -Id {pid}).PriorityClass='BelowNormal'");
+        // processes without FFI; we nudge the priority class instead —
+        // BelowNormal for the IO tier, Idle (lowest) when aggressive.
+        let class = if aggressive { "Idle" } else { "BelowNormal" };
+        let script = format!("(Get-Process -Id {pid}).PriorityClass='{class}'");
         let _ = quiet_tokio_command("powershell")
             .args(["-NoProfile", "-NonInteractive", "-Command", &script])
             .status()
diff --git a/src-tauri/src/resolver.rs b/src-tauri/src/resolver.rs
index ee89bec..95fbb7e 100644
--- a/src-tauri/src/resolver.rs
+++ b/src-tauri/src/resolver.rs
@@ -784,6 +784,7 @@ pub fn default_config_value() -> Value {
         "active_mode": "text",
         "model_cleanup_days": 1,
         "ollama_keep_alive": "30m",
+        "ollama_throttle": "io",
         "kept_models": [],
         "mode_overrides": {},
         "tracked_modes": ["text"],
diff --git a/src/config.ts b/src/config.ts
index 5d2a3d0..ade2a2f 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -112,6 +112,7 @@ const DEFAULT_CONFIG: Config = {
   active_mode: "transcribe",
   model_cleanup_days: 1,
   ollama_keep_alive: "30m",
+  ollama_throttle: "io",
   cleanup_warning_suppressed_families: [],
   kept_models: [],
   mode_overrides: {},
diff --git a/src/types.ts b/src/types.ts
index 98241e5..7f064ca 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -497,6 +497,13 @@ export interface Config {
    *  Longer values avoid cold-start reloads between messages; shorter
    *  values suit memory-tight machines. Default "30m". */
   ollama_keep_alive: string;
+  /** How hard to throttle the Ollama server we spawn while it loads a
+   *  model, so the disk thrash doesn't freeze the machine. "off" = no
+   *  throttle; "io" = disk-IO priority only (keeps inference full speed,
+   *  the default); "aggressive" = also demote CPU/QoS (most responsive
+   *  desktop, slower inference). Only applies when MyOwnLLM spawns Ollama
+   *  itself — not when it's a system/tray service. */
+  ollama_throttle: "off" | "io" | "aggressive";
   /** Family names for which the user has dismissed the
    *  "switching with auto-cleanup on" confirmation in the family
    *  detail view's per-tier picker. Per-family rather than per-tier
diff --git a/src/ui/SettingsPanel.svelte b/src/ui/SettingsPanel.svelte
index 05436c3..4748e49 100644
--- a/src/ui/SettingsPanel.svelte
+++ b/src/ui/SettingsPanel.svelte
@@ -3,6 +3,7 @@
   import ModelsSection from "./settings/ModelsSection.svelte";
   import StorageSection from "./settings/StorageSection.svelte";
   import HardwareSection from "./settings/HardwareSection.svelte";
+  import PerformanceSection from "./settings/PerformanceSection.svelte";
   import UsageSection from "./settings/UsageSection.svelte";
   import UpdatesSection from "./settings/UpdatesSection.svelte";
   import CloudMeshSection from "./settings/CloudMeshSection.svelte";
@@ -18,6 +19,7 @@
     | "prompts"
     | "permissions"
     | "hardware"
+    | "performance"
     | "storage"
     | "usage"
     | "cloud-mesh"
@@ -81,6 +83,7 @@
     { id: "prompts", label: "Prompts" },
     { id: "permissions", label: "Permissions" },
     { id: "hardware", label: "Hardware" },
+    { id: "performance", label: "Performance" },
     { id: "storage", label: "Storage" },
     { id: "usage", label: "Usage" },
     { id: "updates", label: "Updates" },
@@ -151,6 +154,8 @@
         <StorageSection setActive={(t) => (active = t)} />
       {:else if active === "hardware"}
         <HardwareSection setActive={(t) => (active = t)} />
+      {:else if active === "performance"}
+        <PerformanceSection />
       {:else if active === "usage"}
         <UsageSection />
       {:else if active === "cloud-mesh"}
diff --git a/src/ui/settings/HardwareSection.svelte b/src/ui/settings/HardwareSection.svelte
index 9133c7e..8cf37c5 100644
--- a/src/ui/settings/HardwareSection.svelte
+++ b/src/ui/settings/HardwareSection.svelte
@@ -15,23 +15,6 @@
   let conversationDir = $state("");
   let loading = $state(true);
   let error = $state("");
-  /** Ollama `keep_alive` for chat — how long the model stays resident
-   *  in memory after a turn. Longer avoids cold-start reloads between
-   *  messages; shorter frees RAM/VRAM for transcription on tight
-   *  machines. Stored in Ollama's native duration format. */
-  let keepAlive = $state("30m");
-  const KEEP_ALIVE_OPTIONS: { value: string; label: string }[] = [
-    { value: "0", label: "Unload immediately (lowest memory)" },
-    { value: "5m", label: "5 minutes (Ollama default)" },
-    { value: "30m", label: "30 minutes (recommended)" },
-    { value: "1h", label: "1 hour" },
-    { value: "-1", label: "Until the app quits (keep resident)" },
-  ];
-
-  async function patchKeepAlive(value: string) {
-    keepAlive = value;
-    await updateConfig({ ollama_keep_alive: value });
-  }
   /** Tag the resolver picks for transcribe against the active family +
    *  hardware. Resolved here (not just described) so users can confirm
    *  the active whisper model from this tab without bouncing to Models. */
@@ -75,7 +58,6 @@
       ]);
       hardware = hw;
       conversationDir = config.conversation_dir ?? "";
-      keepAlive = config.ollama_keep_alive ?? "30m";
       mic = { ...config.mic };
       micDevices = devices;
       if (manifest) {
@@ -267,36 +249,6 @@
         </dl>
       </div>
 
-      <div class="group-label">Performance</div>
-
-      <div class="card">
-        <div class="card-title">Model memory</div>
-        <p class="card-meta">
-          How long the chat model stays loaded in memory after a reply.
-          Longer keeps later messages instant; shorter frees RAM/VRAM
-          sooner — handy when transcription needs to run alongside on a
-          memory-tight machine.
-        </p>
-        <dl class="info">
-          <div class="full">
-            <dt>Keep model loaded for</dt>
-            <dd>
-              <select
-                value={keepAlive}
-                onchange={(e) => patchKeepAlive((e.currentTarget as HTMLSelectElement).value)}
-              >
-                {#if !KEEP_ALIVE_OPTIONS.some((o) => o.value === keepAlive)}
-                  <option value={keepAlive}>Custom: {keepAlive}</option>
-                {/if}
-                {#each KEEP_ALIVE_OPTIONS as opt (opt.value)}
-                  <option value={opt.value}>{opt.label}</option>
-                {/each}
-              </select>
-            </dd>
-          </div>
-        </dl>
-      </div>
-
       <div class="group-label">Storage</div>
 
       <div class="card">
diff --git a/src/ui/settings/PerformanceSection.svelte b/src/ui/settings/PerformanceSection.svelte
new file mode 100644
index 0000000..95a4d94
--- /dev/null
+++ b/src/ui/settings/PerformanceSection.svelte
@@ -0,0 +1,207 @@
+<script lang="ts">
+  import { onMount } from "svelte";
+  import { loadConfig, updateConfig } from "../../config";
+  import { scrollAffordance } from "../scroll-affordance";
+
+  let loading = $state(true);
+  let error = $state("");
+
+  /** Ollama `keep_alive` for chat — how long the model stays resident in
+   *  memory after a turn. Longer avoids cold-start reloads between
+   *  messages; shorter frees RAM/VRAM. Ollama's native duration format. */
+  let keepAlive = $state("30m");
+  const KEEP_ALIVE_OPTIONS: { value: string; label: string }[] = [
+    { value: "0", label: "Unload immediately (lowest memory)" },
+    { value: "5m", label: "5 minutes (Ollama default)" },
+    { value: "30m", label: "30 minutes (recommended)" },
+    { value: "1h", label: "1 hour" },
+    { value: "-1", label: "Until the app quits (keep resident)" },
+  ];
+
+  /** How hard to throttle the Ollama server while it loads a model so the
+   *  disk thrash doesn't freeze the machine. */
+  type Throttle = "off" | "io" | "aggressive";
+  let throttle = $state<Throttle>("io");
+  const THROTTLE_OPTIONS: { value: Throttle; label: string; hint: string }[] = [
+    {
+      value: "off",
+      label: "Off (fastest load)",
+      hint: "No throttle. Loads fastest, but a big model can briefly bog down the whole machine.",
+    },
+    {
+      value: "io",
+      label: "Balanced — disk only (recommended)",
+      hint: "Eases disk priority during the load so the machine stays responsive, while inference keeps running at full speed.",
+    },
+    {
+      value: "aggressive",
+      label: "Aggressive (most responsive)",
+      hint: "Also lowers CPU priority. Keeps the desktop snappiest during a load, but token generation runs slower.",
+    },
+  ];
+
+  onMount(async () => {
+    try {
+      const config = await loadConfig();
+      keepAlive = config.ollama_keep_alive ?? "30m";
+      throttle = (config.ollama_throttle ?? "io") as Throttle;
+    } catch (e) {
+      error = String(e);
+    } finally {
+      loading = false;
+    }
+  });
+
+  async function patchKeepAlive(value: string) {
+    keepAlive = value;
+    await updateConfig({ ollama_keep_alive: value });
+  }
+
+  async function patchThrottle(value: Throttle) {
+    throttle = value;
+    await updateConfig({ ollama_throttle: value });
+  }
+
+  const throttleHint = $derived(
+    THROTTLE_OPTIONS.find((o) => o.value === throttle)?.hint ?? "",
+  );
+</script>
+
+<div class="section">
+  <div class="head">
+    <p class="lede">
+      Tune how MyOwnLLM trades <strong>responsiveness</strong> against
+      <strong>load and inference speed</strong> when running models locally.
+      These apply to the Ollama server MyOwnLLM launches itself.
+    </p>
+  </div>
+
+  {#if loading}
+    <p class="loading">Loading…</p>
+  {:else if error}
+    <p class="error">{error}</p>
+  {:else}
+    <div class="scroll-affordance-wrap">
+    <div class="cards scroll-fade" use:scrollAffordance>
+      <div class="group-label">Model memory</div>
+
+      <div class="card">
+        <div class="card-title">Keep model loaded</div>
+        <p class="card-meta">
+          How long the chat model stays in memory after a reply. Longer
+          keeps later messages instant; shorter frees RAM/VRAM sooner —
+          handy when transcription needs to run alongside on a
+          memory-tight machine.
+        </p>
+        <dl class="info">
+          <div class="full">
+            <dt>Keep model loaded for</dt>
+            <dd>
+              <select
+                value={keepAlive}
+                onchange={(e) => patchKeepAlive((e.currentTarget as HTMLSelectElement).value)}
+              >
+                {#if !KEEP_ALIVE_OPTIONS.some((o) => o.value === keepAlive)}
+                  <option value={keepAlive}>Custom: {keepAlive}</option>
+                {/if}
+                {#each KEEP_ALIVE_OPTIONS as opt (opt.value)}
+                  <option value={opt.value}>{opt.label}</option>
+                {/each}
+              </select>
+            </dd>
+          </div>
+        </dl>
+      </div>
+
+      <div class="group-label">Loading</div>
+
+      <div class="card">
+        <div class="card-title">Load throttle</div>
+        <p class="card-meta">
+          Loading a model reads gigabytes from disk, which can freeze a
+          laptop. This throttles those reads so the machine stays usable.
+          Loading is disk-bound and inference is compute-bound, so the
+          balanced default eases disk only and leaves token generation at
+          full speed.
+        </p>
+        <dl class="info">
+          <div class="full">
+            <dt>While a model loads</dt>
+            <dd>
+              <select
+                value={throttle}
+                onchange={(e) => patchThrottle((e.currentTarget as HTMLSelectElement).value as Throttle)}
+              >
+                {#each THROTTLE_OPTIONS as opt (opt.value)}
+                  <option value={opt.value}>{opt.label}</option>
+                {/each}
+              </select>
+            </dd>
+          </div>
+        </dl>
+        {#if throttleHint}
+          <p class="card-meta hint">{throttleHint}</p>
+        {/if}
+      </div>
+
+      <p class="footnote">
+        Throttling only applies when MyOwnLLM starts the Ollama server
+        itself. If Ollama is already running as a system or tray service,
+        these settings don't affect it.
+      </p>
+    </div>
+    <div class="scroll-more-hint" aria-hidden="true">
+      <span class="scroll-more-chevron">⌄</span>
+      <span>more below</span>
+    </div>
+    </div>
+  {/if}
+</div>
+
+<style>
+  .section { display: flex; flex-direction: column; height: 100%; min-height: 0; }
+  .head { padding: .75rem 1rem; border-bottom: 1px solid #1e1e1e; flex-shrink: 0; }
+  .lede { font-size: .78rem; color: #888; line-height: 1.5; }
+  .lede strong { color: #ccc; font-weight: 600; }
+
+  .loading, .error { padding: 2rem; text-align: center; color: #555; font-size: .82rem; }
+  .error { color: #d66; }
+
+  .cards { flex: 1; overflow-y: scroll; padding: .75rem; display: flex; flex-direction: column; gap: .6rem; min-height: 0; --scroll-fade-bg: #111; }
+  .group-label {
+    font-size: .68rem; color: #666; text-transform: uppercase;
+    letter-spacing: .06em; margin: .35rem .15rem -.1rem;
+  }
+  .group-label:first-child { margin-top: 0; }
+
+  .card {
+    border: 1px solid #1e1e1e;
+    background: #131318;
+    border-radius: 8px;
+    padding: .75rem .9rem;
+    display: flex; flex-direction: column; gap: .5rem;
+  }
+  .card-title { font-size: .9rem; font-weight: 600; color: #e8e8e8; }
+  .card-meta { font-size: .76rem; color: #888; line-height: 1.5; margin: 0; }
+  .card-meta.hint { color: #9a9ad6; }
+
+  .info { margin: 0; display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: .65rem; }
+  .info > div { display: flex; flex-direction: column; gap: .2rem; min-width: 0; }
+  .info > div.full { grid-column: 1 / -1; }
+  dt { font-size: .68rem; color: #666; text-transform: uppercase; letter-spacing: .03em; }
+  dd { margin: 0; font-size: .82rem; color: #ccc; display: flex; align-items: center; gap: .35rem; flex-wrap: wrap; }
+
+  select {
+    background: #0f0f12;
+    color: #e8e8e8;
+    border: 1px solid #2a2a2a;
+    border-radius: 6px;
+    padding: .3rem .4rem;
+    font-size: .8rem;
+    font-family: inherit;
+    max-width: 100%;
+  }
+  select:focus { outline: none; border-color: #6e6ef7; }
+
+  .footnote { font-size: .72rem; color: #555; line-height: 1.5; padding: .35rem .15rem 0; margin: 0; }
+</style>

From fc832815a8cb8a079ef17750004fa65013e958a9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 05:38:10 +0000
Subject: [PATCH 08/12] Fix rustfmt formatting to unblock CI

cargo fmt --check failed on two lines the earlier cargo fix / clippy
--fix pass left wrapped non-canonically (embedder.rs anyhow! call and
roster.rs re-export list). Reflow to rustfmt's canonical form. fmt
--check, clippy --all-targets, and cargo test all pass locally on the
pinned 1.88.0 toolchain.
---
 src-tauri/src/diarize/embedder.rs | 4 +---
 src-tauri/src/mesh/roster.rs      | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src-tauri/src/diarize/embedder.rs b/src-tauri/src/diarize/embedder.rs
index 0479030..2e9b473 100644
--- a/src-tauri/src/diarize/embedder.rs
+++ b/src-tauri/src/diarize/embedder.rs
@@ -234,9 +234,7 @@ impl Embedder {
         // is correct for both shapes since the leading axes are 1.
         let shape = view.shape().to_vec();
         if shape.last().copied().unwrap_or(0) == 0 {
-            return Err(anyhow!(
-                "embedder produced zero-length output ({shape:?})"
-            ));
+            return Err(anyhow!("embedder produced zero-length output ({shape:?})"));
         }
         let mut out: Vec<f32> = view.iter().copied().collect();
         l2_normalize(&mut out);
diff --git a/src-tauri/src/mesh/roster.rs b/src-tauri/src/mesh/roster.rs
index efa84d4..44a60be 100644
--- a/src-tauri/src/mesh/roster.rs
+++ b/src-tauri/src/mesh/roster.rs
@@ -23,8 +23,7 @@
 //! over.
 
 pub use myownmesh_core::roster::{
-    add_peer, delete, load, remove_peer,
-    save, AuthorizedPeer, Roster, ROSTER_VERSION,
+    add_peer, delete, load, remove_peer, save, AuthorizedPeer, Roster, ROSTER_VERSION,
 };
 
 /// One-shot migration from the pre-multi-network single roster file

From a34467dcdeb8acd9a32751c45619483b95d393c8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 06:21:47 +0000
Subject: [PATCH 09/12] Throttle with a moderate nice; restore warm-on-startup
 default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The io throttle was applied post-spawn via taskpolicy -p, which is a
no-op on macOS, so the server ran unthrottled and a load could starve
the display/networking and freeze the machine. And the previous fix
left the CPU fully open to the server (IO-only), which is what starved
the system in the first place.

Fix: throttle at launch with a moderate nice so the server yields CPU
to the system (display, networking, WebView) when they need it, but
still gets the bulk of the cores when nothing competes — responsive
machine, inference not crippled. Applied as an argv prefix (nice execs
the target), which is also the only reliable way to set macOS IO policy.

- io (balanced, default): nice -n 10 (+ low best-effort ionice on Linux).
- aggressive: nice -n 19 + idle ionice (Linux) / background QoS (macOS).
- Windows: post-spawn priority class (BelowNormal / Idle).
- Fallback to a direct spawn if the wrapper can't bring Ollama up, so a
  missing/incompatible tool never disables the LLM.

Restore warm_on_startup to default ON (the load now runs under the
throttle, so it won't lock up the machine); it remains a toggle in
Settings → Performance.
---
 src-tauri/src/ollama.rs                   | 121 ++++++++++++++------
 src-tauri/src/process.rs                  | 129 ++++++++++++----------
 src-tauri/src/resolver.rs                 |   1 +
 src/config.ts                             |   1 +
 src/types.ts                              |   5 +
 src/ui/App.svelte                         |  20 ++--
 src/ui/settings/PerformanceSection.svelte |  45 +++++++-
 7 files changed, 219 insertions(+), 103 deletions(-)

diff --git a/src-tauri/src/ollama.rs b/src-tauri/src/ollama.rs
index 17093b1..c89bf65 100644
--- a/src-tauri/src/ollama.rs
+++ b/src-tauri/src/ollama.rs
@@ -278,51 +278,108 @@ pub async fn ensure_running() -> Result<()> {
         return Ok(());
     }
 
-    // OLLAMA_ORIGINS=* belt-and-suspenders: when WE spawn the server (e.g. Linux
-    // or a fresh standalone Windows install), this lets the GUI fetch directly
-    // from `http://127.0.0.1:11434` without Ollama's CORS allowlist rejecting
-    // the WebView's `Origin` (which on Tauri 2 / Windows is `http://tauri.localhost`,
-    // not in Ollama's defaults). When the Windows installer runs Ollama as a
-    // tray service we can't influence its env — that's why the GUI also routes
-    // chat through myownllm's API server (see Chat.svelte).
-    let child = quiet_tokio_command("ollama")
-        .arg("serve")
-        .env("OLLAMA_ORIGINS", "*")
-        // Cap memory pressure on the laptop-class machines that freeze
-        // hard while a model pages in: keep at most one model resident,
-        // and serve one request at a time, so Ollama never tries to
-        // hold two models (or N parallel KV caches) in RAM/VRAM at once.
-        .env("OLLAMA_MAX_LOADED_MODELS", "1")
-        .env("OLLAMA_NUM_PARALLEL", "1")
-        .stdout(Stdio::null())
-        .stderr(Stdio::null())
-        .spawn()
-        .context("failed to spawn ollama serve")?;
-
-    // Throttle the server we just spawned so the disk thrash of loading
-    // a model doesn't lock up the whole desktop. The mode is user-tunable
-    // (Settings → Performance); "off" skips it entirely. Best-effort, and
-    // only possible because WE own this process — when Ollama is already
-    // running as a system/tray service we never reach this branch.
+    // Throttle the server at launch (Settings → Performance). Applying it
+    // as an argv prefix is the reliable lever — notably on macOS, where
+    // taskpolicy's IO policy only takes effect when launching a program,
+    // not via `-p` on a running PID (that gap left the throttle a no-op
+    // and let a load thrash the machine). "off" yields no prefix.
     let mode = throttle_mode();
+    let prefix = crate::process::throttle_launch_prefix(&mode);
+
+    // Spawn under the wrapper when we have one. If the wrapper binary is
+    // missing the spawn itself errors — fall back to a plain spawn so a
+    // missing throttle tool can never leave the app without an LLM.
+    let child = match spawn_ollama_serve(prefix.as_deref()) {
+        Ok(c) => c,
+        Err(_) if prefix.is_some() => {
+            spawn_ollama_serve(None).context("failed to spawn ollama serve")?
+        }
+        Err(e) => return Err(anyhow::Error::new(e).context("failed to spawn ollama serve")),
+    };
+    *guard = Some(child);
+
+    // Windows throttles post-spawn (no reliable launch wrapper there);
+    // Unix already throttled via the argv prefix above.
+    #[cfg(target_os = "windows")]
     if mode != "off" {
-        if let Some(pid) = child.id() {
-            crate::process::lower_priority(pid, &mode).await;
+        if let Some(pid) = guard.as_ref().and_then(|c| c.id()) {
+            crate::process::set_priority_windows(pid, &mode).await;
         }
     }
 
-    *guard = Some(child);
-
-    // Wait up to 10 seconds for API to become reachable.
+    // Wait up to 10s for the API. Bail early if a launch-wrapper child
+    // exited without exec'ing ollama (e.g. an unsupported flag on this OS
+    // version): try_wait → Some means the wrapper failed, so we stop
+    // waiting and retry directly below rather than burning the full 10s.
+    let mut up = false;
     for _ in 0..20 {
         tokio::time::sleep(std::time::Duration::from_millis(500)).await;
         if api_reachable().await {
-            return Ok(());
+            up = true;
+            break;
+        }
+        if prefix.is_some() {
+            if let Some(c) = guard.as_mut() {
+                if matches!(c.try_wait(), Ok(Some(_))) {
+                    break; // wrapper died without bringing ollama up
+                }
+            }
+        }
+    }
+    if up {
+        return Ok(());
+    }
+
+    // Wrapped launch never came up — retry with a direct spawn so a
+    // broken/incompatible throttle tool can't disable the LLM entirely.
+    if prefix.is_some() {
+        if let Some(mut dead) = guard.take() {
+            let _ = dead.kill().await;
+        }
+        let direct = spawn_ollama_serve(None).context("failed to spawn ollama serve")?;
+        *guard = Some(direct);
+        for _ in 0..20 {
+            tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+            if api_reachable().await {
+                return Ok(());
+            }
         }
     }
     Err(anyhow!("ollama serve did not become reachable within 10s"))
 }
 
+/// Spawn `ollama serve` with our standard env, optionally under a
+/// launch-time throttle wrapper (`prefix`, e.g. `["taskpolicy", "-d",
+/// "throttle"]`). The wrapper tools exec their target, so the child PID
+/// is ollama itself — kill-on-exit (`stop`) is unaffected.
+///
+/// OLLAMA_ORIGINS=* is belt-and-suspenders: when WE spawn the server this
+/// lets the GUI fetch directly from `http://127.0.0.1:11434` without
+/// Ollama's CORS allowlist rejecting the WebView's `Origin`. The memory
+/// caps keep at most one model resident and serve one request at a time,
+/// so Ollama never tries to hold two models (or N parallel KV caches) in
+/// RAM/VRAM at once — the swap thrash behind the hardest freezes.
+fn spawn_ollama_serve(prefix: Option<&[&str]>) -> std::io::Result<tokio::process::Child> {
+    let mut cmd = match prefix {
+        Some(p) if !p.is_empty() => {
+            let mut c = quiet_tokio_command(p[0]);
+            c.args(&p[1..]).arg("ollama").arg("serve");
+            c
+        }
+        _ => {
+            let mut c = quiet_tokio_command("ollama");
+            c.arg("serve");
+            c
+        }
+    };
+    cmd.env("OLLAMA_ORIGINS", "*")
+        .env("OLLAMA_MAX_LOADED_MODELS", "1")
+        .env("OLLAMA_NUM_PARALLEL", "1")
+        .stdout(Stdio::null())
+        .stderr(Stdio::null())
+        .spawn()
+}
+
 async fn api_reachable() -> bool {
     reqwest_get("http://127.0.0.1:11434/").await.is_ok()
 }
diff --git a/src-tauri/src/process.rs b/src-tauri/src/process.rs
index e2db2ae..7e84356 100644
--- a/src-tauri/src/process.rs
+++ b/src-tauri/src/process.rs
@@ -48,77 +48,86 @@ fn apply_quiet_flags_tokio(cmd: &mut tokio::process::Command) {
 #[cfg(not(target_os = "windows"))]
 fn apply_quiet_flags_tokio(_cmd: &mut tokio::process::Command) {}
 
-/// Best-effort: ease a child process's priority so the heavy reads when
-/// an LLM server pages multi-GB weights in on first use don't starve the
-/// desktop. `mode` controls how hard:
+/// Launch-time throttle wrapper for the LLM server, per the user's `mode`
+/// ("io" | "aggressive"; "off" → `None`). Returned as an argv prefix to
+/// prepend before `ollama serve` rather than something we apply to the
+/// PID after spawn — that distinction matters on macOS, where
+/// `taskpolicy`'s disk-IO policy only takes effect when *launching* a
+/// program, not via `-p` on a running one. Applying it post-spawn was a
+/// silent no-op, which left the server unthrottled and let a model load
+/// thrash the whole machine. The wrapper tools (`ionice`/`nice`/
+/// `taskpolicy`) exec their target, so the resulting child PID is still
+/// ollama and our kill-on-exit handling is unaffected.
 ///
-/// - `"io"` (default): lower **disk IO** priority only. Model loading is
-///   disk-bound and inference is compute-bound, so this keeps the machine
-///   responsive during a load without kneecapping token generation.
-/// - `"aggressive"`: also demote CPU/QoS. Most responsive desktop during
-///   a load, but inference itself runs slower.
+/// - `"io"` (default, "balanced"): a moderate `nice` (CPU) — plus a low
+///   disk-IO class on Linux. `nice` only makes the server yield when
+///   something else (the display server, networking, the WebView) wants
+///   the CPU, so the machine stays responsive during a heavy load while
+///   inference still gets the bulk of the cores when nothing competes.
+///   Crucially it does NOT leave the CPU wide open to the server — which
+///   is what starved the desktop and froze the machine — nor force it
+///   onto efficiency cores like background QoS, so inference isn't
+///   crippled.
+/// - `"aggressive"`: deep `nice` / background QoS — most responsive
+///   desktop during a load, but noticeably slower inference.
 ///
-/// (`"off"` is handled by the caller, which simply doesn't call this.)
-/// Every call is fire-and-forget: a missing tool or permission error just
-/// means no throttle, never a hard failure.
-#[allow(unused_variables)] // `pid`/`mode` are unused on platforms without a branch
-pub async fn lower_priority(pid: u32, mode: &str) {
-    let pid = pid.to_string();
+/// `None` on Windows (it throttles post-spawn via [`set_priority_windows`]
+/// instead) and for `"off"`.
+pub fn throttle_launch_prefix(mode: &str) -> Option<Vec<&'static str>> {
+    if mode == "off" {
+        return None;
+    }
     let aggressive = mode == "aggressive";
     #[cfg(target_os = "linux")]
     {
-        if aggressive {
-            // Idle IO class (only runs when nothing else wants the disk)
-            // plus a CPU nice — maximum desktop responsiveness, slower
-            // load and inference.
-            let _ = quiet_tokio_command("ionice")
-                .args(["-c", "3", "-p", &pid])
-                .status()
-                .await;
-            let _ = quiet_tokio_command("renice")
-                .args(["-n", "5", "-p", &pid])
-                .status()
-                .await;
+        Some(if aggressive {
+            // Max nice + idle IO class — server only runs when nothing
+            // else wants CPU or disk. Snappiest desktop, slowest model.
+            vec!["nice", "-n", "19", "ionice", "-c", "3"]
         } else {
-            // Best-effort IO class, lowest priority (7): still gets disk
-            // time but yields under contention. IO-only — no renice, so
-            // inference keeps full CPU once loaded.
-            let _ = quiet_tokio_command("ionice")
-                .args(["-c", "2", "-n", "7", "-p", &pid])
-                .status()
-                .await;
-        }
+            // Moderate nice so the system keeps headroom, plus low
+            // best-effort IO so disk reads yield under contention. The
+            // server still gets most of the CPU when it's the only thing
+            // running, so inference stays fast.
+            vec!["nice", "-n", "10", "ionice", "-c", "2", "-n", "7"]
+        })
     }
     #[cfg(target_os = "macos")]
     {
-        if aggressive {
-            // Background QoS: demotes to efficiency cores and throttles
-            // both compute and IO. Frees the machine most, but slows
-            // inference noticeably.
-            let _ = quiet_tokio_command("taskpolicy")
-                .args(["-b", "-p", &pid])
-                .status()
-                .await;
+        Some(if aggressive {
+            // Background QoS: efficiency cores + throttled compute & IO.
+            // Frees the machine most, but slows inference.
+            vec!["taskpolicy", "-b"]
         } else {
-            // Disk IO policy "throttle" (IOPOL_THROTTLE) only — leaves CPU
-            // scheduling and QoS untouched, so inference runs on the
-            // performance cores at full speed.
-            let _ = quiet_tokio_command("taskpolicy")
-                .args(["-d", "throttle", "-p", &pid])
-                .status()
-                .await;
-        }
+            // Moderate nice only — reserves CPU headroom for the system
+            // (display, networking) while leaving the server on the
+            // performance cores, so inference isn't kneecapped. `nice` is
+            // POSIX and always present, so this can't fail the launch.
+            vec!["nice", "-n", "10"]
+        })
     }
-    #[cfg(target_os = "windows")]
+    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
     {
-        // Windows doesn't expose per-process IO priority to other
-        // processes without FFI; we nudge the priority class instead —
-        // BelowNormal for the IO tier, Idle (lowest) when aggressive.
-        let class = if aggressive { "Idle" } else { "BelowNormal" };
-        let script = format!("(Get-Process -Id {pid}).PriorityClass='{class}'");
-        let _ = quiet_tokio_command("powershell")
-            .args(["-NoProfile", "-NonInteractive", "-Command", &script])
-            .status()
-            .await;
+        let _ = aggressive; // consumed on platforms without a launch wrapper
+        None
     }
 }
+
+/// Windows-only: ease the spawned server's priority class after spawn.
+/// Windows has no launch-time IO-throttle wrapper we can rely on and no
+/// external per-process IO priority without FFI, so we nudge the priority
+/// class instead — `BelowNormal` for the default, `Idle` when aggressive.
+/// Best-effort; failure just means no throttle.
+#[cfg(target_os = "windows")]
+pub async fn set_priority_windows(pid: u32, mode: &str) {
+    let class = if mode == "aggressive" {
+        "Idle"
+    } else {
+        "BelowNormal"
+    };
+    let script = format!("(Get-Process -Id {pid}).PriorityClass='{class}'");
+    let _ = quiet_tokio_command("powershell")
+        .args(["-NoProfile", "-NonInteractive", "-Command", &script])
+        .status()
+        .await;
+}
diff --git a/src-tauri/src/resolver.rs b/src-tauri/src/resolver.rs
index 95fbb7e..84d53f4 100644
--- a/src-tauri/src/resolver.rs
+++ b/src-tauri/src/resolver.rs
@@ -785,6 +785,7 @@ pub fn default_config_value() -> Value {
         "model_cleanup_days": 1,
         "ollama_keep_alive": "30m",
         "ollama_throttle": "io",
+        "warm_on_startup": true,
         "kept_models": [],
         "mode_overrides": {},
         "tracked_modes": ["text"],
diff --git a/src/config.ts b/src/config.ts
index ade2a2f..43235c6 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -113,6 +113,7 @@ const DEFAULT_CONFIG: Config = {
   model_cleanup_days: 1,
   ollama_keep_alive: "30m",
   ollama_throttle: "io",
+  warm_on_startup: true,
   cleanup_warning_suppressed_families: [],
   kept_models: [],
   mode_overrides: {},
diff --git a/src/types.ts b/src/types.ts
index 7f064ca..ed92052 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -504,6 +504,11 @@ export interface Config {
    *  desktop, slower inference). Only applies when MyOwnLLM spawns Ollama
    *  itself — not when it's a system/tray service. */
   ollama_throttle: "off" | "io" | "aggressive";
+  /** Warm (preload) the active chat model in the background at startup so
+   *  the first message doesn't pay the cold-load wait. On by default; the
+   *  load runs under the configured throttle so it doesn't lock up the
+   *  machine. Can be turned off in Settings → Performance. */
+  warm_on_startup: boolean;
   /** Family names for which the user has dismissed the
    *  "switching with auto-cleanup on" confirmation in the family
    *  detail view's per-tier picker. Per-family rather than per-tier
diff --git a/src/ui/App.svelte b/src/ui/App.svelte
index 17871c0..c83b85a 100644
--- a/src/ui/App.svelte
+++ b/src/ui/App.svelte
@@ -277,13 +277,19 @@
       view = "chat";
       invoke("ollama_ensure_running").catch(() => {});
 
-      // Proactively warm the chat model in the background so its
-      // one-time cold load happens now — with the throttled server
-      // keeping the machine responsive — rather than freezing the user
-      // on their first message. Skipped when the model isn't on disk
-      // yet (the download overlay owns that flow) or when keep_alive is
-      // "0" (warming would just load-then-unload). Fire-and-forget.
-      if (pendingTextModel && !textModelMissing && config.ollama_keep_alive !== "0") {
+      // Warm the chat model in the background so the first message doesn't
+      // pay the cold-load wait. On by default; the load runs under the
+      // configured throttle (Settings → Performance) so it doesn't lock up
+      // the machine. Skipped when the user turned it off, when the model
+      // isn't on disk yet (the download overlay owns that), or when
+      // keep_alive is "0" (warming would just load-then-unload).
+      // Fire-and-forget.
+      if (
+        config.warm_on_startup !== false &&
+        pendingTextModel &&
+        !textModelMissing &&
+        config.ollama_keep_alive !== "0"
+      ) {
         invoke("ollama_warm", { model: pendingTextModel }).catch(() => {});
       }
 
diff --git a/src/ui/settings/PerformanceSection.svelte b/src/ui/settings/PerformanceSection.svelte
index 95a4d94..54d939c 100644
--- a/src/ui/settings/PerformanceSection.svelte
+++ b/src/ui/settings/PerformanceSection.svelte
@@ -26,25 +26,30 @@
     {
       value: "off",
       label: "Off (fastest load)",
-      hint: "No throttle. Loads fastest, but a big model can briefly bog down the whole machine.",
+      hint: "No throttle. Loads fastest, but a big model can saturate the CPU and briefly freeze the machine.",
     },
     {
       value: "io",
-      label: "Balanced — disk only (recommended)",
-      hint: "Eases disk priority during the load so the machine stays responsive, while inference keeps running at full speed.",
+      label: "Balanced (recommended)",
+      hint: "Lowers the model's priority a notch so the system — display, networking — keeps enough CPU to stay responsive during a load, while inference still gets the bulk of the cores.",
     },
     {
       value: "aggressive",
       label: "Aggressive (most responsive)",
-      hint: "Also lowers CPU priority. Keeps the desktop snappiest during a load, but token generation runs slower.",
+      hint: "Deeply deprioritizes the model. Keeps the desktop snappiest during a load, but token generation runs noticeably slower.",
     },
   ];
 
+  /** Preload the chat model at startup so the first message is instant.
+   *  On by default; the load runs under the throttle above. */
+  let warmOnStartup = $state(true);
+
   onMount(async () => {
     try {
       const config = await loadConfig();
       keepAlive = config.ollama_keep_alive ?? "30m";
       throttle = (config.ollama_throttle ?? "io") as Throttle;
+      warmOnStartup = config.warm_on_startup ?? true;
     } catch (e) {
       error = String(e);
     } finally {
@@ -52,6 +57,11 @@
     }
   });
 
+  async function patchWarmOnStartup(value: boolean) {
+    warmOnStartup = value;
+    await updateConfig({ warm_on_startup: value });
+  }
+
   async function patchKeepAlive(value: string) {
     keepAlive = value;
     await updateConfig({ ollama_keep_alive: value });
@@ -144,6 +154,23 @@
         {/if}
       </div>
 
+      <div class="card">
+        <div class="card-title">Warm at startup</div>
+        <p class="card-meta">
+          Preload the chat model in the background when the app starts, so
+          your first message doesn't wait for it to load. The load runs
+          under the throttle above, so it won't lock up the machine.
+        </p>
+        <label class="toggle">
+          <input
+            type="checkbox"
+            checked={warmOnStartup}
+            onchange={(e) => patchWarmOnStartup((e.currentTarget as HTMLInputElement).checked)}
+          />
+          Warm the chat model at startup
+        </label>
+      </div>
+
       <p class="footnote">
         Throttling only applies when MyOwnLLM starts the Ollama server
         itself. If Ollama is already running as a system or tray service,
@@ -204,4 +231,14 @@
   select:focus { outline: none; border-color: #6e6ef7; }
 
   .footnote { font-size: .72rem; color: #555; line-height: 1.5; padding: .35rem .15rem 0; margin: 0; }
+
+  .toggle {
+    display: inline-flex;
+    align-items: center;
+    gap: .45rem;
+    font-size: .82rem;
+    color: #ccc;
+    cursor: pointer;
+  }
+  .toggle input { accent-color: #6e6ef7; }
 </style>

From 67562ee7f4941f8681396419ae6d39b6d663e926 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 06:35:39 +0000
Subject: [PATCH 10/12] Make the cold-start indicator inline, not a modal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the floating load dialog with an in-bubble indicator that takes
the place of the typing dots while the model loads — no jolting overlay.
Minimal prose: a reassurance word that rotates every 3s with a moving
shine (recreated per change so it fades in), plus a quiet live CPU/RAM
line as proof the machine is still working. The composer's Stop button
already covers cancel, so the modal's Cancel/heading/spinner are gone.
---
 src/ui/Chat.svelte | 280 ++++++++++++++-------------------------------
 1 file changed, 84 insertions(+), 196 deletions(-)

diff --git a/src/ui/Chat.svelte b/src/ui/Chat.svelte
index e905a4f..7a1558c 100644
--- a/src/ui/Chat.svelte
+++ b/src/ui/Chat.svelte
@@ -134,6 +134,32 @@
   let modelLoading = $state(false);
   let modelLoadTimer: ReturnType<typeof setTimeout> | null = null;
 
+  /** Reassurance words for the cold-start indicator — cycled every few
+   *  seconds (with a shine) so the user can see something is alive while
+   *  the model loads. Short and low-key; the rotation itself is the
+   *  "still working, not frozen" signal. */
+  const LOADING_WORDS = [
+    "Working on it…",
+    "Loading the model…",
+    "Warming up…",
+    "Reading the weights…",
+    "Getting set up…",
+    "Hang tight…",
+    "Almost there…",
+  ];
+  const LOADING_WORD_MS = 3000;
+  let loadingWordIdx = $state(0);
+  // Rotate the reassurance word while a load is in progress. The $effect's
+  // cleanup clears the interval the moment `modelLoading` goes false.
+  $effect(() => {
+    if (!modelLoading) return;
+    loadingWordIdx = 0;
+    const id = setInterval(() => {
+      loadingWordIdx = (loadingWordIdx + 1) % LOADING_WORDS.length;
+    }, LOADING_WORD_MS);
+    return () => clearInterval(id);
+  });
+
   /** Live CPU/RAM/GPU snapshot shown inside the load-wait dialog so
    *  the user can see *why* it's slow (e.g. RAM near full → the model
    *  is paging in from disk). Reuses the same `usage_live_snapshot`
@@ -205,11 +231,6 @@
     if (bytes == null) return "—";
     return `${(bytes / 1024 ** 3).toFixed(1)} GB`;
   }
-  /** RAM-in-use percentage for the mini bar, or null when unknown. */
-  function ramUsedPct(): number | null {
-    if (!liveStats?.ram_used_bytes || !liveStats?.ram_total_bytes) return null;
-    return (liveStats.ram_used_bytes / liveStats.ram_total_bytes) * 100;
-  }
 
   /** One pending attachment staged for the next send. Images become
    *  Ollama-style `images: [base64]` array entries on the user
@@ -1383,7 +1404,30 @@
     {/each}
     {#if streaming && (messages.length === 0 || messages[messages.length - 1].role !== "assistant")}
       <div class="message assistant">
-        <div class="bubble"><span class="dots"><span></span><span></span><span></span></span></div>
+        <div class="bubble">
+          {#if modelLoading}
+            <!-- Cold-start: the model is (re)loading into memory. Replace
+                 the typing dots in place (no jolting modal) with a calmer
+                 reassurance — a word that rotates every few seconds with a
+                 shine, plus a quiet live CPU/RAM line as proof the machine
+                 is still working, not frozen. -->
+            <div class="loading-inline" aria-live="polite">
+              {#key loadingWordIdx}
+                <span class="loading-word">{LOADING_WORDS[loadingWordIdx]}</span>
+              {/key}
+              {#if !routeViaDevicePubkey && liveStats}
+                <span class="loading-meta">
+                  {#if liveStats.cpu_total_pct != null}CPU {Math.round(liveStats.cpu_total_pct)}%{/if}
+                  {#if liveStats.ram_used_bytes != null && liveStats.ram_total_bytes != null}
+                    · RAM {fmtGb(liveStats.ram_used_bytes)}/{fmtGb(liveStats.ram_total_bytes)}
+                  {/if}
+                </span>
+              {/if}
+            </div>
+          {:else}
+            <span class="dots"><span></span><span></span><span></span></span>
+          {/if}
+        </div>
       </div>
     {/if}
   </div>
@@ -1517,74 +1561,6 @@
   {/if}
   </div>
 
-  {#if modelLoading}
-    <!-- Cold-start dialog. Shown only when the first token hasn't
-         arrived within MODEL_LOAD_POPUP_DELAY_MS — i.e. the model is
-         (re)loading into memory. Non-blocking: the user can keep
-         reading the transcript behind it, and Cancel aborts the run. -->
-    <div class="model-loading-backdrop" role="dialog" aria-modal="false" aria-live="polite">
-      <div class="model-loading-card">
-        <div class="model-loading-head">
-          <div class="spinner" aria-hidden="true"></div>
-          <div class="model-loading-text">
-            {#if routeViaDevicePubkey}
-              <p class="model-loading-title">Waiting for {routedPeer?.label ?? "the host"}…</p>
-              <p class="model-loading-sub">The host is loading its model. This can take a few seconds.</p>
-            {:else}
-              <p class="model-loading-title">Loading {activeModel}…</p>
-              <p class="model-loading-sub">First use reads the model into memory. This is usually a one-time wait — later replies start instantly.</p>
-            {/if}
-          </div>
-          <button class="model-loading-cancel" onclick={stop}>Cancel</button>
-        </div>
-
-        <!-- Live resource readout — only meaningful for local loads
-             (a remote model loads on the host's machine, not this
-             one). Bars + figures come from usage_live_snapshot, the
-             same lookup the Usage settings tab uses. -->
-        {#if !routeViaDevicePubkey}
-          <div class="model-loading-stats">
-            <div class="stat">
-              <div class="stat-row">
-                <span class="stat-label">CPU</span>
-                <span class="stat-val">{liveStats?.cpu_total_pct != null ? `${Math.round(liveStats.cpu_total_pct)}%` : "—"}</span>
-              </div>
-              <div class="meter"><div class="meter-fill" style="width: {Math.min(100, Math.max(0, liveStats?.cpu_total_pct ?? 0))}%"></div></div>
-            </div>
-            <div class="stat">
-              <div class="stat-row">
-                <span class="stat-label">RAM</span>
-                <span class="stat-val">
-                  {#if liveStats?.ram_used_bytes != null && liveStats?.ram_total_bytes != null}
-                    {fmtGb(liveStats.ram_used_bytes)} / {fmtGb(liveStats.ram_total_bytes)}
-                  {:else}—{/if}
-                </span>
-              </div>
-              <div class="meter"><div class="meter-fill" class:hot={(ramUsedPct() ?? 0) >= 90} style="width: {Math.min(100, Math.max(0, ramUsedPct() ?? 0))}%"></div></div>
-            </div>
-            {#if liveStats?.gpu_pct != null || liveStats?.vram_total_bytes != null}
-              <div class="stat">
-                <div class="stat-row">
-                  <span class="stat-label">GPU</span>
-                  <span class="stat-val">
-                    {liveStats?.gpu_pct != null ? `${Math.round(liveStats.gpu_pct)}%` : "—"}
-                    {#if liveStats?.vram_used_bytes != null && liveStats?.vram_total_bytes != null}
-                      <span class="stat-sub">· VRAM {fmtGb(liveStats.vram_used_bytes)} / {fmtGb(liveStats.vram_total_bytes)}</span>
-                    {/if}
-                  </span>
-                </div>
-                <div class="meter"><div class="meter-fill" style="width: {Math.min(100, Math.max(0, liveStats?.gpu_pct ?? 0))}%"></div></div>
-              </div>
-            {/if}
-            {#if hardware?.disk_free_gb != null}
-              <p class="stat-disk">Disk free: {hardware.disk_free_gb.toFixed(1)} GB</p>
-            {/if}
-          </div>
-        {/if}
-      </div>
-    </div>
-  {/if}
-
   {#if settingsTab}
     <SettingsPanel
       initialTab={settingsTab}
@@ -1610,136 +1586,48 @@
     position: relative;
   }
 
-  /* Cold-start model-loading dialog. Floats over the chat surface
-     without blocking it (pointer-events scoped to the card). */
-  .model-loading-backdrop {
-    position: absolute;
-    inset: 0;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    background: rgba(0, 0, 0, 0.45);
-    z-index: 40;
-    pointer-events: none;
-    animation: model-loading-fade 0.18s ease-out;
-  }
-  @keyframes model-loading-fade {
-    from { opacity: 0; }
-    to { opacity: 1; }
-  }
-  .model-loading-card {
-    pointer-events: auto;
-    display: flex;
-    flex-direction: column;
-    gap: 0.85rem;
-    width: 25rem;
-    max-width: calc(100% - 2rem);
-    padding: 1.1rem 1.25rem;
-    background: #181818;
-    border: 1px solid #2a2a2a;
-    border-radius: 12px;
-    box-shadow: 0 12px 40px rgba(0, 0, 0, 0.5);
-  }
-  .model-loading-head {
-    display: flex;
-    align-items: center;
-    gap: 0.9rem;
-  }
-  .model-loading-card .spinner {
-    flex: none;
-    width: 24px;
-    height: 24px;
-    border: 3px solid #333;
-    border-top-color: #6e6ef7;
-    border-radius: 50%;
-    animation: spin 0.8s linear infinite;
-  }
-  @keyframes spin {
-    to { transform: rotate(360deg); }
-  }
-  .model-loading-text {
-    flex: 1;
-    min-width: 0;
-  }
-  .model-loading-title {
-    margin: 0;
-    color: #e8e8e8;
-    font-size: 0.95rem;
-    font-weight: 600;
-  }
-  .model-loading-sub {
-    margin: 0.25rem 0 0;
-    color: #999;
-    font-size: 0.8rem;
-    line-height: 1.35;
-  }
-  .model-loading-cancel {
-    flex: none;
-    align-self: flex-start;
-    background: none;
-    border: 1px solid #2a2a2a;
-    border-radius: 6px;
-    color: #bbb;
-    padding: 0.3rem 0.6rem;
-    font-size: 0.8rem;
-    cursor: pointer;
-  }
-  .model-loading-cancel:hover {
-    border-color: #3a3a55;
-    color: #ddd;
-  }
-  .model-loading-stats {
-    display: flex;
-    flex-direction: column;
-    gap: 0.55rem;
-    padding-top: 0.85rem;
-    border-top: 1px solid #242424;
-  }
-  .model-loading-stats .stat {
+  /* Cold-start inline indicator — sits in the assistant bubble in place
+     of the typing dots while the model loads. A reassurance word with a
+     moving shine, recreated on each rotation so it fades in, plus a quiet
+     live CPU/RAM line. */
+  .loading-inline {
     display: flex;
     flex-direction: column;
     gap: 0.25rem;
   }
-  .model-loading-stats .stat-row {
-    display: flex;
-    justify-content: space-between;
-    align-items: baseline;
-    gap: 0.5rem;
-    font-size: 0.76rem;
-  }
-  .model-loading-stats .stat-label {
-    color: #888;
-    text-transform: uppercase;
-    letter-spacing: 0.04em;
-    font-size: 0.68rem;
-  }
-  .model-loading-stats .stat-val {
-    color: #ccc;
-    font-variant-numeric: tabular-nums;
-  }
-  .model-loading-stats .stat-sub {
-    color: #777;
-    font-size: 0.72rem;
-  }
-  .model-loading-stats .meter {
-    height: 5px;
-    background: #242424;
-    border-radius: 3px;
-    overflow: hidden;
+  .loading-word {
+    display: inline-block;
+    font-size: 0.9rem;
+    font-weight: 500;
+    background: linear-gradient(
+      90deg,
+      #8a8a8a 0%,
+      #8a8a8a 38%,
+      #eaeaff 50%,
+      #8a8a8a 62%,
+      #8a8a8a 100%
+    );
+    background-size: 220% 100%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    -webkit-text-fill-color: transparent;
+    color: transparent;
+    animation:
+      loading-word-in 0.4s ease-out,
+      loading-shine 2.4s linear infinite;
   }
-  .model-loading-stats .meter-fill {
-    height: 100%;
-    background: #6e6ef7;
-    border-radius: 3px;
-    transition: width 0.4s ease;
+  @keyframes loading-shine {
+    0% { background-position: 160% 0; }
+    100% { background-position: -160% 0; }
   }
-  .model-loading-stats .meter-fill.hot {
-    background: #e35a5a;
+  @keyframes loading-word-in {
+    from { opacity: 0; transform: translateY(2px); }
+    to { opacity: 1; transform: translateY(0); }
   }
-  .model-loading-stats .stat-disk {
-    margin: 0.1rem 0 0;
+  .loading-meta {
     font-size: 0.72rem;
-    color: #777;
+    color: #6a6a6a;
+    font-variant-numeric: tabular-nums;
   }
   .chat-body {
     flex: 1;

From 97dfe11500f4a846d4ab5e726af5c6f0ac1a931a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 06:45:58 +0000
Subject: [PATCH 11/12] Hold a loading screen during startup warm; share the
 indicator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract the cold-start indicator (rotating shining word + live CPU/RAM)
into a reusable LoadingPulse component and use it in two places:

- In chat: still shown in place of the typing dots whenever a call is
  slow (cold load or a long-running turn) — unchanged behavior, now via
  the shared component.
- At startup: when warm-on-startup runs, hold a full-screen loading
  screen (spinner + LoadingPulse beneath it) over the chat until the
  model is resident, instead of dropping into a chat that feels sluggish
  while it competes with the cold load. The chat still mounts behind the
  screen, so it's ready the moment the screen lifts; a Continue button is
  the escape hatch.

LoadingPulse self-manages its word rotation and usage poll (mount/
unmount lifecycle), so Chat no longer hand-rolls those.
---
 src/ui/App.svelte          |  77 +++++++++++++++--
 src/ui/Chat.svelte         | 172 +++++--------------------------------
 src/ui/LoadingPulse.svelte | 113 ++++++++++++++++++++++++
 3 files changed, 204 insertions(+), 158 deletions(-)
 create mode 100644 src/ui/LoadingPulse.svelte

diff --git a/src/ui/App.svelte b/src/ui/App.svelte
index c83b85a..75f6a26 100644
--- a/src/ui/App.svelte
+++ b/src/ui/App.svelte
@@ -7,6 +7,7 @@
   import Chat from "./Chat.svelte";
   import TranscribeView from "./TranscribeView.svelte";
   import Sidebar from "./Sidebar.svelte";
+  import LoadingPulse from "./LoadingPulse.svelte";
   import PermissionPromptModal from "./PermissionPromptModal.svelte";
   import { loadConfig, updateConfig } from "../config";
   import { getActiveManifest } from "../providers";
@@ -83,6 +84,12 @@
   type View = "loading" | "chat";
 
   let view = $state<View>("loading");
+  /** True while the startup warm is in flight: we keep a full-screen
+   *  loading screen up (rather than dropping into a chat that feels
+   *  sluggish because it's competing with the cold load) until the model
+   *  is resident. The chat still mounts behind it, so it's fully ready
+   *  when the screen lifts. A "Continue" button is the escape hatch. */
+  let warming = $state(false);
   let appVersion = $state("");
   let hardware = $state<HardwareProfile | null>(null);
   let activeModel = $state("");
@@ -277,20 +284,28 @@
       view = "chat";
       invoke("ollama_ensure_running").catch(() => {});
 
-      // Warm the chat model in the background so the first message doesn't
-      // pay the cold-load wait. On by default; the load runs under the
-      // configured throttle (Settings → Performance) so it doesn't lock up
-      // the machine. Skipped when the user turned it off, when the model
-      // isn't on disk yet (the download overlay owns that), or when
-      // keep_alive is "0" (warming would just load-then-unload).
-      // Fire-and-forget.
+      // Warm the chat model so the first message doesn't pay the cold-load
+      // wait. On by default; the load runs under the configured throttle
+      // (Settings → Performance). Skipped when the user turned it off, when
+      // the model isn't on disk yet (the download overlay owns that), or
+      // when keep_alive is "0" (warming would just load-then-unload).
+      //
+      // We hold a full-screen loading screen (`warming`) over the chat —
+      // which keeps mounting/initializing behind it — until the warm
+      // settles, so the user lands on a chat that's actually ready instead
+      // of one that feels sluggish while it competes with the cold load.
       if (
         config.warm_on_startup !== false &&
         pendingTextModel &&
         !textModelMissing &&
         config.ollama_keep_alive !== "0"
       ) {
-        invoke("ollama_warm", { model: pendingTextModel }).catch(() => {});
+        warming = true;
+        invoke("ollama_warm", { model: pendingTextModel })
+          .catch(() => {})
+          .finally(() => {
+            warming = false;
+          });
       }
 
       kickUpdateCheck();
@@ -1160,6 +1175,23 @@
        the same modal; the modal self-hides when the prompt queue
        drains. -->
   <PermissionPromptModal />
+
+  {#if warming}
+    <!-- Startup warm: keep a loading screen up until the model is
+         resident. The chat mounts behind this, so it's ready the moment
+         the screen lifts. Same shining word + live CPU/RAM as the in-chat
+         indicator, under the spinner. Continue is the escape hatch. -->
+    <div class="warming-overlay">
+      <div class="spinner"></div>
+      <LoadingPulse showStats={true} />
+      <button class="warming-skip" onclick={() => (warming = false)}>
+        Continue to chat →
+      </button>
+      {#if appVersion}
+        <p class="splash-version">v{appVersion}</p>
+      {/if}
+    </div>
+  {/if}
 </div>
 
 <style>
@@ -1334,6 +1366,35 @@
     color: #555;
     margin-top: -0.5rem;
   }
+  /* Startup-warm loading screen. Full-screen, opaque, same look as the
+     initial splash (spinner on top, LoadingPulse just beneath). */
+  .warming-overlay {
+    position: fixed;
+    inset: 0;
+    z-index: 55;
+    background: #111;
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    justify-content: center;
+    gap: 1rem;
+    color: #888;
+  }
+  .warming-skip {
+    margin-top: 0.25rem;
+    background: none;
+    border: 1px solid #2a2a2a;
+    color: #888;
+    padding: 0.4rem 0.8rem;
+    border-radius: 6px;
+    font-size: 0.8rem;
+    cursor: pointer;
+    transition: color 0.12s, border-color 0.12s;
+  }
+  .warming-skip:hover {
+    color: #ccc;
+    border-color: #3a3a55;
+  }
   .spinner {
     width: 28px;
     height: 28px;
diff --git a/src/ui/Chat.svelte b/src/ui/Chat.svelte
index 7a1558c..4e99b3a 100644
--- a/src/ui/Chat.svelte
+++ b/src/ui/Chat.svelte
@@ -1,10 +1,11 @@
 <script lang="ts">
   import { invoke } from "@tauri-apps/api/core";
-  import { onDestroy, tick } from "svelte";
+  import { tick } from "svelte";
   import TopBar from "./TopBar.svelte";
   import TextBar from "./TextBar.svelte";
   import SettingsPanel from "./SettingsPanel.svelte";
   import DownloadOverlay from "./DownloadOverlay.svelte";
+  import LoadingPulse from "./LoadingPulse.svelte";
   import {
     loadConversation,
     saveConversation,
@@ -15,7 +16,7 @@
     type ToolCall,
   } from "../conversations";
   import type { SettingsTab } from "../update-state.svelte";
-  import type { HardwareProfile, Mode, LiveSnapshot } from "../types";
+  import type { HardwareProfile, Mode } from "../types";
   import {
     chatSlot,
     claimChat,
@@ -124,99 +125,33 @@
   /** Cold-start UX: Ollama loads a model's weights into RAM/VRAM on
    *  the first request after it was evicted (or never loaded this
    *  session). That gap — between firing the chat stream and the
-   *  first token coming back — is silent today. If it runs long we
-   *  surface a small "loading the model" dialog so the user knows the
-   *  app isn't wedged. Once any frame arrives (delta, tool call, or
-   *  terminal event) the model is resident and we tear the dialog
-   *  down; we don't re-arm it for later turns in the same run since
-   *  the model is warm by then. */
+   *  first token coming back — is otherwise silent. When it runs long
+   *  we swap the typing dots for a `LoadingPulse` (rotating reassurance
+   *  word + live CPU/RAM) so the user knows the app isn't wedged. Once
+   *  any frame arrives (delta, tool call, or terminal event) the model
+   *  is resident and we clear it; we don't re-arm for later turns in
+   *  the same run since the model is warm by then. The indicator owns
+   *  its own word rotation + usage poll, mounting/unmounting with
+   *  `modelLoading`. */
   const MODEL_LOAD_POPUP_DELAY_MS = 5000;
   let modelLoading = $state(false);
   let modelLoadTimer: ReturnType<typeof setTimeout> | null = null;
 
-  /** Reassurance words for the cold-start indicator — cycled every few
-   *  seconds (with a shine) so the user can see something is alive while
-   *  the model loads. Short and low-key; the rotation itself is the
-   *  "still working, not frozen" signal. */
-  const LOADING_WORDS = [
-    "Working on it…",
-    "Loading the model…",
-    "Warming up…",
-    "Reading the weights…",
-    "Getting set up…",
-    "Hang tight…",
-    "Almost there…",
-  ];
-  const LOADING_WORD_MS = 3000;
-  let loadingWordIdx = $state(0);
-  // Rotate the reassurance word while a load is in progress. The $effect's
-  // cleanup clears the interval the moment `modelLoading` goes false.
-  $effect(() => {
-    if (!modelLoading) return;
-    loadingWordIdx = 0;
-    const id = setInterval(() => {
-      loadingWordIdx = (loadingWordIdx + 1) % LOADING_WORDS.length;
-    }, LOADING_WORD_MS);
-    return () => clearInterval(id);
-  });
-
-  /** Live CPU/RAM/GPU snapshot shown inside the load-wait dialog so
-   *  the user can see *why* it's slow (e.g. RAM near full → the model
-   *  is paging in from disk). Reuses the same `usage_live_snapshot`
-   *  command + cadence the Usage settings tab uses, so there's a
-   *  single source of truth for system lookups. Polled only while the
-   *  dialog is up; null when idle. */
-  let liveStats = $state<LiveSnapshot | null>(null);
-  let statsPollHandle: ReturnType<typeof setInterval> | null = null;
-  /** Poll cadence for the load-wait readout. Faster than the Usage
-   *  tab's 2s so a short load still gets a real CPU% reading (the
-   *  first sample only primes the delta cache). */
-  const STATS_POLL_MS = 1200;
-
-  async function refreshLiveStats() {
-    try {
-      liveStats = await invoke<LiveSnapshot>("usage_live_snapshot");
-    } catch {
-      // Non-fatal: the dialog still shows the spinner + copy without
-      // the resource readout if the snapshot command is unavailable.
-    }
-  }
-
-  function startStatsPoll() {
-    if (statsPollHandle !== null) return;
-    void refreshLiveStats(); // prime the CPU delta cache immediately
-    statsPollHandle = setInterval(() => void refreshLiveStats(), STATS_POLL_MS);
-  }
-
-  function stopStatsPoll() {
-    if (statsPollHandle !== null) {
-      clearInterval(statsPollHandle);
-      statsPollHandle = null;
-    }
-    liveStats = null;
-  }
-
-  /** Clear the load-wait dialog + its arming timer and stop the
-   *  resource poll. Idempotent, so it's safe to call from every agent
-   *  event and from cleanup. */
+  /** Clear the load indicator + its arming timer. Idempotent, so it's
+   *  safe to call from every agent event and from cleanup. */
   function clearModelLoadWait() {
     if (modelLoadTimer !== null) {
       clearTimeout(modelLoadTimer);
       modelLoadTimer = null;
     }
     if (modelLoading) modelLoading = false;
-    stopStatsPoll();
   }
 
-  // Belt-and-suspenders: never leak the interval if the panel is torn
-  // down (mode switch, conversation close) mid-load.
-  onDestroy(stopStatsPoll);
-
   /** Resolve once the browser has actually painted: a Svelte tick to
    *  flush the DOM update, then two animation frames so the compositor
-   *  draws the frame. We await this after showing the load dialog and
+   *  draws the frame. We await this after showing the indicator and
    *  before kicking off a cold model load — a heavy load can thrash a
-   *  laptop badly enough that an un-painted dialog would never appear. */
+   *  laptop badly enough that an un-painted indicator would never appear. */
   function nextPaint(): Promise<void> {
     return tick().then(
       () =>
@@ -226,12 +161,6 @@
     );
   }
 
-  /** Format a byte count as a compact GB string for the readout. */
-  function fmtGb(bytes: number | null | undefined): string {
-    if (bytes == null) return "—";
-    return `${(bytes / 1024 ** 3).toFixed(1)} GB`;
-  }
-
   /** One pending attachment staged for the next send. Images become
    *  Ollama-style `images: [base64]` array entries on the user
    *  message; text-like files (JSON, configs, source, plain text)
@@ -966,12 +895,10 @@
     }
     if (coldStart) {
       modelLoading = true;
-      startStatsPoll();
-      await nextPaint(); // get the dialog on screen before the load freeze
+      await nextPaint(); // get the indicator on screen before the load freeze
     } else {
       modelLoadTimer = setTimeout(() => {
         modelLoading = true;
-        startStatsPoll();
       }, MODEL_LOAD_POPUP_DELAY_MS);
     }
 
@@ -1406,24 +1333,12 @@
       <div class="message assistant">
         <div class="bubble">
           {#if modelLoading}
-            <!-- Cold-start: the model is (re)loading into memory. Replace
-                 the typing dots in place (no jolting modal) with a calmer
-                 reassurance — a word that rotates every few seconds with a
-                 shine, plus a quiet live CPU/RAM line as proof the machine
-                 is still working, not frozen. -->
-            <div class="loading-inline" aria-live="polite">
-              {#key loadingWordIdx}
-                <span class="loading-word">{LOADING_WORDS[loadingWordIdx]}</span>
-              {/key}
-              {#if !routeViaDevicePubkey && liveStats}
-                <span class="loading-meta">
-                  {#if liveStats.cpu_total_pct != null}CPU {Math.round(liveStats.cpu_total_pct)}%{/if}
-                  {#if liveStats.ram_used_bytes != null && liveStats.ram_total_bytes != null}
-                    · RAM {fmtGb(liveStats.ram_used_bytes)}/{fmtGb(liveStats.ram_total_bytes)}
-                  {/if}
-                </span>
-              {/if}
-            </div>
+            <!-- Cold-start (or a long-running call): the model is loading
+                 / still working. Replace the typing dots in place (no
+                 jolting modal) with the calmer LoadingPulse — a rotating
+                 reassurance word + live CPU/RAM. Stats are hidden for the
+                 remote path (the load is on the host's machine). -->
+            <LoadingPulse showStats={!routeViaDevicePubkey} />
           {:else}
             <span class="dots"><span></span><span></span><span></span></span>
           {/if}
@@ -1586,49 +1501,6 @@
     position: relative;
   }
 
-  /* Cold-start inline indicator — sits in the assistant bubble in place
-     of the typing dots while the model loads. A reassurance word with a
-     moving shine, recreated on each rotation so it fades in, plus a quiet
-     live CPU/RAM line. */
-  .loading-inline {
-    display: flex;
-    flex-direction: column;
-    gap: 0.25rem;
-  }
-  .loading-word {
-    display: inline-block;
-    font-size: 0.9rem;
-    font-weight: 500;
-    background: linear-gradient(
-      90deg,
-      #8a8a8a 0%,
-      #8a8a8a 38%,
-      #eaeaff 50%,
-      #8a8a8a 62%,
-      #8a8a8a 100%
-    );
-    background-size: 220% 100%;
-    -webkit-background-clip: text;
-    background-clip: text;
-    -webkit-text-fill-color: transparent;
-    color: transparent;
-    animation:
-      loading-word-in 0.4s ease-out,
-      loading-shine 2.4s linear infinite;
-  }
-  @keyframes loading-shine {
-    0% { background-position: 160% 0; }
-    100% { background-position: -160% 0; }
-  }
-  @keyframes loading-word-in {
-    from { opacity: 0; transform: translateY(2px); }
-    to { opacity: 1; transform: translateY(0); }
-  }
-  .loading-meta {
-    font-size: 0.72rem;
-    color: #6a6a6a;
-    font-variant-numeric: tabular-nums;
-  }
   .chat-body {
     flex: 1;
     min-height: 0;
diff --git a/src/ui/LoadingPulse.svelte b/src/ui/LoadingPulse.svelte
new file mode 100644
index 0000000..fdff5f2
--- /dev/null
+++ b/src/ui/LoadingPulse.svelte
@@ -0,0 +1,113 @@
+<script lang="ts">
+  // Calm "still working, not frozen" indicator: a reassurance word that
+  // rotates every few seconds with a moving shine, plus an optional quiet
+  // live CPU/RAM line as proof of life. Self-contained — it owns its word
+  // rotation and (if showStats is true at mount) its usage poll — so it drops
+  // into both the in-chat cold-start bubble and the startup warming screen.
+  import { onMount, onDestroy } from "svelte";
+  import { invoke } from "@tauri-apps/api/core";
+  import type { LiveSnapshot } from "../types";
+
+  let { showStats = true }: { showStats?: boolean } = $props();
+
+  const WORDS = [
+    "Working on it…",
+    "Loading the model…",
+    "Warming up…",
+    "Reading the weights…",
+    "Getting set up…",
+    "Hang tight…",
+    "Almost there…",
+  ];
+  const WORD_MS = 3000;
+  const STATS_POLL_MS = 1200;
+
+  let wordIdx = $state(0);
+  let live = $state<LiveSnapshot | null>(null);
+  let wordTimer: ReturnType<typeof setInterval> | null = null;
+  let statsTimer: ReturnType<typeof setInterval> | null = null;
+
+  function fmtGb(bytes: number | null | undefined): string {
+    if (bytes == null) return "—";
+    return `${(bytes / 1024 ** 3).toFixed(1)} GB`;
+  }
+
+  /** Pending snapshot poll, if any; reused so slow polls never stack or land out of order. */
+  let inFlight: Promise<void> | null = null;
+  function refresh(): Promise<void> {
+    return (inFlight ??= invoke<LiveSnapshot>("usage_live_snapshot")
+      .then((snap) => void (live = snap), () => {}) // non-fatal: word still rotates without stats
+      .finally(() => void (inFlight = null)));
+  }
+
+  onMount(() => {
+    wordTimer = setInterval(() => {
+      wordIdx = (wordIdx + 1) % WORDS.length;
+    }, WORD_MS);
+    if (showStats) {
+      void refresh(); // prime the CPU delta cache immediately
+      statsTimer = setInterval(() => void refresh(), STATS_POLL_MS);
+    }
+  });
+
+  onDestroy(() => {
+    if (wordTimer) clearInterval(wordTimer);
+    if (statsTimer) clearInterval(statsTimer);
+  });
+</script>
+
+<div class="loading-inline" aria-live="polite">
+  {#key wordIdx}
+    <span class="loading-word">{WORDS[wordIdx]}</span>
+  {/key}
+  {#if showStats && live}
+    <span class="loading-meta">
+      {#if live.cpu_total_pct != null}CPU {Math.round(live.cpu_total_pct)}%{/if}
+      {#if live.ram_used_bytes != null && live.ram_total_bytes != null}
+        · RAM {fmtGb(live.ram_used_bytes)}/{fmtGb(live.ram_total_bytes)}
+      {/if}
+    </span>
+  {/if}
+</div>
+
+<style>
+  .loading-inline {
+    display: flex;
+    flex-direction: column;
+    gap: 0.25rem;
+  }
+  .loading-word {
+    display: inline-block;
+    font-size: 0.9rem;
+    font-weight: 500;
+    background: linear-gradient(
+      90deg,
+      #8a8a8a 0%,
+      #8a8a8a 38%,
+      #eaeaff 50%,
+      #8a8a8a 62%,
+      #8a8a8a 100%
+    );
+    background-size: 220% 100%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    -webkit-text-fill-color: transparent;
+    color: transparent;
+    animation:
+      loading-word-in 0.4s ease-out,
+      loading-shine 2.4s linear infinite;
+  }
+  @keyframes loading-shine {
+    0% { background-position: 160% 0; }
+    100% { background-position: -160% 0; }
+  }
+  @keyframes loading-word-in {
+    from { opacity: 0; transform: translateY(2px); }
+    to { opacity: 1; transform: translateY(0); }
+  }
+  .loading-meta {
+    font-size: 0.72rem;
+    color: #6a6a6a;
+    font-variant-numeric: tabular-nums;
+  }
+</style>

From 2fe04a39275e0980c3541406fb2a02ba08770c0f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 29 May 2026 06:49:16 +0000
Subject: [PATCH 12/12] Make loading indicator phrases ambiguous

The indicator covers both a cold model load and a slow in-progress
turn, so model-specific phrases (Loading the model / Reading the
weights / Warming up) wrongly implied a reload mid-chat. Swap for
neutral 'work is underway' phrases that fit either case.
---
 src/ui/LoadingPulse.svelte | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/ui/LoadingPulse.svelte b/src/ui/LoadingPulse.svelte
index fdff5f2..cbbf92d 100644
--- a/src/ui/LoadingPulse.svelte
+++ b/src/ui/LoadingPulse.svelte
@@ -10,13 +10,17 @@
 
   let { showStats = true }: { showStats?: boolean } = $props();
 
+  // Deliberately ambiguous about *what's* happening: this same indicator
+  // covers both a cold model load and a slow in-progress turn, so phrases
+  // like "Loading the model…" would wrongly suggest a reload mid-chat.
+  // These just reassure that work is underway, whatever the cause.
   const WORDS = [
     "Working on it…",
-    "Loading the model…",
-    "Warming up…",
-    "Reading the weights…",
-    "Getting set up…",
+    "Thinking it through…",
+    "Crunching…",
     "Hang tight…",
+    "Still working…",
+    "Just a moment…",
     "Almost there…",
   ];
   const WORD_MS = 3000;