From 3828dff78785fab207515a8f3e864d1c2df89712 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 01:19:00 +0000 Subject: [PATCH 01/12] Add cold-start model-loading dialog to chat When the first token doesn't arrive within 5s of sending, surface a small non-blocking dialog explaining the model is loading into memory, with a Cancel button. The dialog tears down on the first frame (delta, tool call, or terminal event) and isn't re-armed for warm later turns. --- src/ui/Chat.svelte | 132 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/src/ui/Chat.svelte b/src/ui/Chat.svelte index afd4856..b4dd2f1 100644 --- a/src/ui/Chat.svelte +++ b/src/ui/Chat.svelte @@ -120,6 +120,28 @@ let input = $state(""); let streaming = $state(false); + /** Cold-start UX: Ollama loads a model's weights into RAM/VRAM on + * the first request after it was evicted (or never loaded this + * session). That gap — between firing the chat stream and the + * first token coming back — is silent today. If it runs long we + * surface a small "loading the model" dialog so the user knows the + * app isn't wedged. Once any frame arrives (delta, tool call, or + * terminal event) the model is resident and we tear the dialog + * down; we don't re-arm it for later turns in the same run since + * the model is warm by then. */ + const MODEL_LOAD_POPUP_DELAY_MS = 5000; + let modelLoading = $state(false); + let modelLoadTimer: ReturnType | null = null; + /** Clear the load-wait dialog + its arming timer. Idempotent, so + * it's safe to call from every agent event and from cleanup. */ + function clearModelLoadWait() { + if (modelLoadTimer !== null) { + clearTimeout(modelLoadTimer); + modelLoadTimer = null; + } + if (modelLoading) modelLoading = false; + } + /** One pending attachment staged for the next send. 
Images become * Ollama-style `images: [base64]` array entries on the user * message; text-like files (JSON, configs, source, plain text) @@ -838,6 +860,13 @@ enabledToolSet.has(t.definition.function.name), ); + // Arm the cold-start dialog: if no frame has come back by the + // delay, the model is (re)loading into memory and we tell the + // user so. clearModelLoadWait() below disarms it on first frame. + modelLoadTimer = setTimeout(() => { + modelLoading = true; + }, MODEL_LOAD_POPUP_DELAY_MS); + try { await runAgent({ messages: working, @@ -849,6 +878,9 @@ viaDevicePubkey: routeViaDevicePubkey, signal: controller.signal, onEvent: (event: AgentEvent) => { + // Any frame means the model is resident and producing — + // tear down the load-wait dialog (idempotent). + clearModelLoadWait(); switch (event.kind) { case "assistant_delta": case "thinking_delta": { @@ -916,6 +948,9 @@ streaming = false; agentAbortController = null; inFlightToolCallIds = new Set(); + // Belt-and-suspenders: if the run ended before any frame (error + // thrown, instant cancel), the timer/dialog could still be live. + clearModelLoadWait(); // Drop the streaming flag on any straggler bubble so its //
can collapse cleanly once the answer is in. if (liveIdx !== -1 && liveIdx < messages.length) { @@ -958,6 +993,9 @@ // `infer_cancel` (mesh path) on whatever turn is in flight, then // unwinds the loop without starting another round. agentAbortController?.abort(); + // Drop the load-wait dialog right away rather than waiting for the + // stream to unwind through the finally block. + clearModelLoadWait(); } function onKeydown(e: KeyboardEvent) { @@ -1392,6 +1430,28 @@ {/if} + {#if modelLoading} + + + {/if} + {#if settingsTab} Date: Fri, 29 May 2026 04:23:51 +0000 Subject: [PATCH 02/12] Add configurable keep_alive + live resource readout on load dialog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two improvements to the model-loading experience: 1. Cold-start fix: chat requests now send Ollama a keep_alive so the model stays resident between turns instead of relying on Ollama's 5-minute default (which caused repeated cold-start reloads). The value is user-configurable in Settings > Hardware > Performance, defaulting to 30m, with options from 'unload immediately' (for memory-tight machines coexisting with transcription) to 'keep until the app quits'. Read in Rust from config so both the streaming and one-shot chat paths pick it up. 2. The model-loading dialog now shows a live CPU / RAM / GPU readout (and disk free) so the user can see why a load is slow — e.g. RAM near full means the model is paging in from disk. Reuses the existing usage_live_snapshot command and the LiveSnapshot type, now promoted to types.ts and shared with the Usage settings tab. 
--- src-tauri/src/ollama.rs | 23 +++ src-tauri/src/resolver.rs | 1 + src/config.ts | 1 + src/types.ts | 28 ++++ src/ui/Chat.svelte | 194 +++++++++++++++++++++++-- src/ui/settings/HardwareSection.svelte | 48 ++++++ src/ui/settings/UsageSection.svelte | 19 +-- 7 files changed, 280 insertions(+), 34 deletions(-) diff --git a/src-tauri/src/ollama.rs b/src-tauri/src/ollama.rs index 1157782..d3a62be 100644 --- a/src-tauri/src/ollama.rs +++ b/src-tauri/src/ollama.rs @@ -730,6 +730,27 @@ pub enum ChatStreamOutcome { Cancelled, } +/// Resolve the user's configured Ollama `keep_alive` for chat +/// requests. This controls how long Ollama keeps the model resident +/// in RAM/VRAM after a turn finishes: longer values avoid cold-start +/// reloads between messages (the common "why is it slow again?" +/// complaint), shorter values free memory sooner so the LLM can +/// coexist with transcription on a memory-tight machine. Accepts +/// Ollama's native duration format — "30m", "1h", "0" (unload +/// immediately), "-1" (keep until evicted). Falls back to "30m" when +/// the config is unreadable or the key is absent (older configs). +fn chat_keep_alive() -> serde_json::Value { + crate::resolver::load_config_value() + .ok() + .and_then(|c| { + c.get("ollama_keep_alive") + .and_then(|v| v.as_str()) + .map(str::to_string) + }) + .map(serde_json::Value::from) + .unwrap_or_else(|| serde_json::json!("30m")) +} + /// Streamed chat completion. 
Invokes `on_content` for each visible token /// chunk, `on_thinking` for any reasoning/thinking deltas (thinking models /// emit those in `message.thinking`; non-thinking models never call it), @@ -771,6 +792,7 @@ where "model": model, "messages": messages, "stream": true, + "keep_alive": chat_keep_alive(), }); if let Some(t) = think { body["think"] = serde_json::json!(t); @@ -933,6 +955,7 @@ pub async fn chat_once( "model": model, "messages": messages, "stream": false, + "keep_alive": chat_keep_alive(), }); if let Some(opts) = options { body["options"] = opts; diff --git a/src-tauri/src/resolver.rs b/src-tauri/src/resolver.rs index 9a0e92e..ee89bec 100644 --- a/src-tauri/src/resolver.rs +++ b/src-tauri/src/resolver.rs @@ -783,6 +783,7 @@ pub fn default_config_value() -> Value { "active_family": "gemma4", "active_mode": "text", "model_cleanup_days": 1, + "ollama_keep_alive": "30m", "kept_models": [], "mode_overrides": {}, "tracked_modes": ["text"], diff --git a/src/config.ts b/src/config.ts index b7bda88..5d2a3d0 100644 --- a/src/config.ts +++ b/src/config.ts @@ -111,6 +111,7 @@ const DEFAULT_CONFIG: Config = { // active_mode they persisted (mergeDefaults overlays raw on top). active_mode: "transcribe", model_cleanup_days: 1, + ollama_keep_alive: "30m", cleanup_warning_suppressed_families: [], kept_models: [], mode_overrides: {}, diff --git a/src/types.ts b/src/types.ts index 77d2fe9..98241e5 100644 --- a/src/types.ts +++ b/src/types.ts @@ -11,6 +11,27 @@ export interface HardwareProfile { soc?: string | null; } +/** Live resource snapshot — mirrors the `LiveSnapshot` struct in + * src-tauri/src/usage.rs, returned by the `usage_live_snapshot` + * Tauri command. Every counter is optional on the Rust side so the + * UI renders "—" when a platform doesn't expose it. Shared between + * the Usage settings tab and the chat model-loading dialog so both + * read the same system lookups. 
*/ +export interface LiveSnapshot { + cpu_app_pct: number | null; + cpu_total_pct: number | null; + ram_app_bytes: number | null; + ram_total_bytes: number | null; + ram_used_bytes: number | null; + gpu_pct: number | null; + vram_app_bytes: number | null; + vram_used_bytes: number | null; + vram_total_bytes: number | null; + process_uptime_seconds: number; + cpu_brand: string | null; + cpu_count: number | null; +} + export type Mode = "text" | "vision" | "code" | "transcribe" | "diarize"; /** Runtimes the resolver knows how to dispatch to. @@ -469,6 +490,13 @@ export interface Config { active_family: string; active_mode: Mode; model_cleanup_days: number; + /** Ollama `keep_alive` for chat requests — how long the model stays + * resident in RAM/VRAM after a turn before Ollama unloads it. + * Native Ollama duration format: "30m", "1h", "0" (unload right + * away, frees memory for transcription), "-1" (keep until evicted). + * Longer values avoid cold-start reloads between messages; shorter + * values suit memory-tight machines. Default "30m". */ + ollama_keep_alive: string; /** Family names for which the user has dismissed the * "switching with auto-cleanup on" confirmation in the family * detail view's per-tier picker. Per-family rather than per-tier diff --git a/src/ui/Chat.svelte b/src/ui/Chat.svelte index b4dd2f1..4b2995d 100644 --- a/src/ui/Chat.svelte +++ b/src/ui/Chat.svelte @@ -1,5 +1,6 @@ + +
+
+

+ Tune how MyOwnLLM trades responsiveness against + load and inference speed when running models locally. + These apply to the Ollama server MyOwnLLM launches itself. +

+
+ + {#if loading} +

Loading…

+ {:else if error} +

{error}

+ {:else} +
+
+
Model memory
+ +
+
Keep model loaded
+

+ How long the chat model stays in memory after a reply. Longer + keeps later messages instant; shorter frees RAM/VRAM sooner — + handy when transcription needs to run alongside on a + memory-tight machine. +

+
+
+
Keep model loaded for
+
+ +
+
+
+
+ +
Loading
+ +
+
Load throttle
+

+ Loading a model reads gigabytes from disk, which can freeze a + laptop. This throttles those reads so the machine stays usable. + Loading is disk-bound and inference is compute-bound, so the + balanced default eases disk only and leaves token generation at + full speed. +

+
+
+
While a model loads
+
+ +
+
+
+ {#if throttleHint} +

{throttleHint}

+ {/if} +
+ +

+ Throttling only applies when MyOwnLLM starts the Ollama server + itself. If Ollama is already running as a system or tray service, + these settings don't affect it. +

+
+ +
+ {/if} +
+ + From fc832815a8cb8a079ef17750004fa65013e958a9 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 05:38:10 +0000 Subject: [PATCH 08/12] Fix rustfmt formatting to unblock CI cargo fmt --check failed on two lines the earlier cargo fix / clippy --fix pass left wrapped non-canonically (embedder.rs anyhow! call and roster.rs re-export list). Reflow to rustfmt's canonical form. fmt --check, clippy --all-targets, and cargo test all pass locally on the pinned 1.88.0 toolchain. --- src-tauri/src/diarize/embedder.rs | 4 +--- src-tauri/src/mesh/roster.rs | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src-tauri/src/diarize/embedder.rs b/src-tauri/src/diarize/embedder.rs index 0479030..2e9b473 100644 --- a/src-tauri/src/diarize/embedder.rs +++ b/src-tauri/src/diarize/embedder.rs @@ -234,9 +234,7 @@ impl Embedder { // is correct for both shapes since the leading axes are 1. let shape = view.shape().to_vec(); if shape.last().copied().unwrap_or(0) == 0 { - return Err(anyhow!( - "embedder produced zero-length output ({shape:?})" - )); + return Err(anyhow!("embedder produced zero-length output ({shape:?})")); } let mut out: Vec = view.iter().copied().collect(); l2_normalize(&mut out); diff --git a/src-tauri/src/mesh/roster.rs b/src-tauri/src/mesh/roster.rs index efa84d4..44a60be 100644 --- a/src-tauri/src/mesh/roster.rs +++ b/src-tauri/src/mesh/roster.rs @@ -23,8 +23,7 @@ //! over. 
pub use myownmesh_core::roster::{ - add_peer, delete, load, remove_peer, - save, AuthorizedPeer, Roster, ROSTER_VERSION, + add_peer, delete, load, remove_peer, save, AuthorizedPeer, Roster, ROSTER_VERSION, }; /// One-shot migration from the pre-multi-network single roster file From a34467dcdeb8acd9a32751c45619483b95d393c8 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 06:21:47 +0000 Subject: [PATCH 09/12] Throttle with a moderate nice; restore warm-on-startup default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The io throttle was applied post-spawn via taskpolicy -p, which is a no-op on macOS, so the server ran unthrottled and a load could starve the display/networking and freeze the machine. And the previous fix left the CPU fully open to the server (IO-only), which is what starved the system in the first place. Fix: throttle at launch with a moderate nice so the server yields CPU to the system (display, networking, WebView) when they need it, but still gets the bulk of the cores when nothing competes — responsive machine, inference not crippled. Applied as an argv prefix (nice execs the target), which is also the only reliable way to set macOS IO policy. - io (balanced, default): nice -n 10 (+ low best-effort ionice on Linux). - aggressive: nice -n 19 + idle ionice (Linux) / background QoS (macOS). - Windows: post-spawn priority class (BelowNormal / Idle). - Fallback to a direct spawn if the wrapper can't bring Ollama up, so a missing/incompatible tool never disables the LLM. Restore warm_on_startup to default ON (the load now runs under the throttle, so it won't lock up the machine); it remains a toggle in Settings → Performance. 
--- src-tauri/src/ollama.rs | 121 ++++++++++++++------ src-tauri/src/process.rs | 129 ++++++++++++---------- src-tauri/src/resolver.rs | 1 + src/config.ts | 1 + src/types.ts | 5 + src/ui/App.svelte | 20 ++-- src/ui/settings/PerformanceSection.svelte | 45 +++++++- 7 files changed, 219 insertions(+), 103 deletions(-) diff --git a/src-tauri/src/ollama.rs b/src-tauri/src/ollama.rs index 17093b1..c89bf65 100644 --- a/src-tauri/src/ollama.rs +++ b/src-tauri/src/ollama.rs @@ -278,51 +278,108 @@ pub async fn ensure_running() -> Result<()> { return Ok(()); } - // OLLAMA_ORIGINS=* belt-and-suspenders: when WE spawn the server (e.g. Linux - // or a fresh standalone Windows install), this lets the GUI fetch directly - // from `http://127.0.0.1:11434` without Ollama's CORS allowlist rejecting - // the WebView's `Origin` (which on Tauri 2 / Windows is `http://tauri.localhost`, - // not in Ollama's defaults). When the Windows installer runs Ollama as a - // tray service we can't influence its env — that's why the GUI also routes - // chat through myownllm's API server (see Chat.svelte). - let child = quiet_tokio_command("ollama") - .arg("serve") - .env("OLLAMA_ORIGINS", "*") - // Cap memory pressure on the laptop-class machines that freeze - // hard while a model pages in: keep at most one model resident, - // and serve one request at a time, so Ollama never tries to - // hold two models (or N parallel KV caches) in RAM/VRAM at once. - .env("OLLAMA_MAX_LOADED_MODELS", "1") - .env("OLLAMA_NUM_PARALLEL", "1") - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .spawn() - .context("failed to spawn ollama serve")?; - - // Throttle the server we just spawned so the disk thrash of loading - // a model doesn't lock up the whole desktop. The mode is user-tunable - // (Settings → Performance); "off" skips it entirely. Best-effort, and - // only possible because WE own this process — when Ollama is already - // running as a system/tray service we never reach this branch. 
+ // Throttle the server at launch (Settings → Performance). Applying it + // as an argv prefix is the reliable lever — notably on macOS, where + // taskpolicy's IO policy only takes effect when launching a program, + // not via `-p` on a running PID (that gap left the throttle a no-op + // and let a load thrash the machine). "off" yields no prefix. let mode = throttle_mode(); + let prefix = crate::process::throttle_launch_prefix(&mode); + + // Spawn under the wrapper when we have one. If the wrapper binary is + // missing the spawn itself errors — fall back to a plain spawn so a + // missing throttle tool can never leave the app without an LLM. + let child = match spawn_ollama_serve(prefix.as_deref()) { + Ok(c) => c, + Err(_) if prefix.is_some() => { + spawn_ollama_serve(None).context("failed to spawn ollama serve")? + } + Err(e) => return Err(anyhow::Error::new(e).context("failed to spawn ollama serve")), + }; + *guard = Some(child); + + // Windows throttles post-spawn (no reliable launch wrapper there); + // Unix already throttled via the argv prefix above. + #[cfg(target_os = "windows")] if mode != "off" { - if let Some(pid) = child.id() { - crate::process::lower_priority(pid, &mode).await; + if let Some(pid) = guard.as_ref().and_then(|c| c.id()) { + crate::process::set_priority_windows(pid, &mode).await; } } - *guard = Some(child); - - // Wait up to 10 seconds for API to become reachable. + // Wait up to 10s for the API. Bail early if a launch-wrapper child + // exited without exec'ing ollama (e.g. an unsupported flag on this OS + // version): try_wait → Some means the wrapper failed, so we stop + // waiting and retry directly below rather than burning the full 10s. 
+ let mut up = false; for _ in 0..20 { tokio::time::sleep(std::time::Duration::from_millis(500)).await; if api_reachable().await { - return Ok(()); + up = true; + break; + } + if prefix.is_some() { + if let Some(c) = guard.as_mut() { + if matches!(c.try_wait(), Ok(Some(_))) { + break; // wrapper died without bringing ollama up + } + } + } + } + if up { + return Ok(()); + } + + // Wrapped launch never came up — retry with a direct spawn so a + // broken/incompatible throttle tool can't disable the LLM entirely. + if prefix.is_some() { + if let Some(mut dead) = guard.take() { + let _ = dead.kill().await; + } + let direct = spawn_ollama_serve(None).context("failed to spawn ollama serve")?; + *guard = Some(direct); + for _ in 0..20 { + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + if api_reachable().await { + return Ok(()); + } } } Err(anyhow!("ollama serve did not become reachable within 10s")) } +/// Spawn `ollama serve` with our standard env, optionally under a +/// launch-time throttle wrapper (`prefix`, e.g. `["taskpolicy", "-d", +/// "throttle"]`). The wrapper tools exec their target, so the child PID +/// is ollama itself — kill-on-exit (`stop`) is unaffected. +/// +/// OLLAMA_ORIGINS=* is belt-and-suspenders: when WE spawn the server this +/// lets the GUI fetch directly from `http://127.0.0.1:11434` without +/// Ollama's CORS allowlist rejecting the WebView's `Origin`. The memory +/// caps keep at most one model resident and serve one request at a time, +/// so Ollama never tries to hold two models (or N parallel KV caches) in +/// RAM/VRAM at once — the swap thrash behind the hardest freezes. 
+fn spawn_ollama_serve(prefix: Option<&[&str]>) -> std::io::Result { + let mut cmd = match prefix { + Some(p) if !p.is_empty() => { + let mut c = quiet_tokio_command(p[0]); + c.args(&p[1..]).arg("ollama").arg("serve"); + c + } + _ => { + let mut c = quiet_tokio_command("ollama"); + c.arg("serve"); + c + } + }; + cmd.env("OLLAMA_ORIGINS", "*") + .env("OLLAMA_MAX_LOADED_MODELS", "1") + .env("OLLAMA_NUM_PARALLEL", "1") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() +} + async fn api_reachable() -> bool { reqwest_get("http://127.0.0.1:11434/").await.is_ok() } diff --git a/src-tauri/src/process.rs b/src-tauri/src/process.rs index e2db2ae..7e84356 100644 --- a/src-tauri/src/process.rs +++ b/src-tauri/src/process.rs @@ -48,77 +48,86 @@ fn apply_quiet_flags_tokio(cmd: &mut tokio::process::Command) { #[cfg(not(target_os = "windows"))] fn apply_quiet_flags_tokio(_cmd: &mut tokio::process::Command) {} -/// Best-effort: ease a child process's priority so the heavy reads when -/// an LLM server pages multi-GB weights in on first use don't starve the -/// desktop. `mode` controls how hard: +/// Launch-time throttle wrapper for the LLM server, per the user's `mode` +/// ("io" | "aggressive"; "off" → `None`). Returned as an argv prefix to +/// prepend before `ollama serve` rather than something we apply to the +/// PID after spawn — that distinction matters on macOS, where +/// `taskpolicy`'s disk-IO policy only takes effect when *launching* a +/// program, not via `-p` on a running one. Applying it post-spawn was a +/// silent no-op, which left the server unthrottled and let a model load +/// thrash the whole machine. The wrapper tools (`ionice`/`nice`/ +/// `taskpolicy`) exec their target, so the resulting child PID is still +/// ollama and our kill-on-exit handling is unaffected. /// -/// - `"io"` (default): lower **disk IO** priority only. 
Model loading is -/// disk-bound and inference is compute-bound, so this keeps the machine -/// responsive during a load without kneecapping token generation. -/// - `"aggressive"`: also demote CPU/QoS. Most responsive desktop during -/// a load, but inference itself runs slower. +/// - `"io"` (default, "balanced"): a moderate `nice` (CPU) — plus a low +/// disk-IO class on Linux. `nice` only makes the server yield when +/// something else (the display server, networking, the WebView) wants +/// the CPU, so the machine stays responsive during a heavy load while +/// inference still gets the bulk of the cores when nothing competes. +/// Crucially it does NOT leave the CPU wide open to the server — which +/// is what starved the desktop and froze the machine — nor force it +/// onto efficiency cores like background QoS, so inference isn't +/// crippled. +/// - `"aggressive"`: deep `nice` / background QoS — most responsive +/// desktop during a load, but noticeably slower inference. /// -/// (`"off"` is handled by the caller, which simply doesn't call this.) -/// Every call is fire-and-forget: a missing tool or permission error just -/// means no throttle, never a hard failure. -#[allow(unused_variables)] // `pid`/`mode` are unused on platforms without a branch -pub async fn lower_priority(pid: u32, mode: &str) { - let pid = pid.to_string(); +/// `None` on Windows (it throttles post-spawn via [`set_priority_windows`] +/// instead) and for `"off"`. +pub fn throttle_launch_prefix(mode: &str) -> Option> { + if mode == "off" { + return None; + } let aggressive = mode == "aggressive"; #[cfg(target_os = "linux")] { - if aggressive { - // Idle IO class (only runs when nothing else wants the disk) - // plus a CPU nice — maximum desktop responsiveness, slower - // load and inference. 
- let _ = quiet_tokio_command("ionice") - .args(["-c", "3", "-p", &pid]) - .status() - .await; - let _ = quiet_tokio_command("renice") - .args(["-n", "5", "-p", &pid]) - .status() - .await; + Some(if aggressive { + // Max nice + idle IO class — server only runs when nothing + // else wants CPU or disk. Snappiest desktop, slowest model. + vec!["nice", "-n", "19", "ionice", "-c", "3"] } else { - // Best-effort IO class, lowest priority (7): still gets disk - // time but yields under contention. IO-only — no renice, so - // inference keeps full CPU once loaded. - let _ = quiet_tokio_command("ionice") - .args(["-c", "2", "-n", "7", "-p", &pid]) - .status() - .await; - } + // Moderate nice so the system keeps headroom, plus low + // best-effort IO so disk reads yield under contention. The + // server still gets most of the CPU when it's the only thing + // running, so inference stays fast. + vec!["nice", "-n", "10", "ionice", "-c", "2", "-n", "7"] + }) } #[cfg(target_os = "macos")] { - if aggressive { - // Background QoS: demotes to efficiency cores and throttles - // both compute and IO. Frees the machine most, but slows - // inference noticeably. - let _ = quiet_tokio_command("taskpolicy") - .args(["-b", "-p", &pid]) - .status() - .await; + Some(if aggressive { + // Background QoS: efficiency cores + throttled compute & IO. + // Frees the machine most, but slows inference. + vec!["taskpolicy", "-b"] } else { - // Disk IO policy "throttle" (IOPOL_THROTTLE) only — leaves CPU - // scheduling and QoS untouched, so inference runs on the - // performance cores at full speed. - let _ = quiet_tokio_command("taskpolicy") - .args(["-d", "throttle", "-p", &pid]) - .status() - .await; - } + // Moderate nice only — reserves CPU headroom for the system + // (display, networking) while leaving the server on the + // performance cores, so inference isn't kneecapped. `nice` is + // POSIX and always present, so this can't fail the launch. 
+ vec!["nice", "-n", "10"] + }) } - #[cfg(target_os = "windows")] + #[cfg(not(any(target_os = "linux", target_os = "macos")))] { - // Windows doesn't expose per-process IO priority to other - // processes without FFI; we nudge the priority class instead — - // BelowNormal for the IO tier, Idle (lowest) when aggressive. - let class = if aggressive { "Idle" } else { "BelowNormal" }; - let script = format!("(Get-Process -Id {pid}).PriorityClass='{class}'"); - let _ = quiet_tokio_command("powershell") - .args(["-NoProfile", "-NonInteractive", "-Command", &script]) - .status() - .await; + let _ = aggressive; // consumed on platforms without a launch wrapper + None } } + +/// Windows-only: ease the spawned server's priority class after spawn. +/// Windows has no launch-time IO-throttle wrapper we can rely on and no +/// external per-process IO priority without FFI, so we nudge the priority +/// class instead — `BelowNormal` for the default, `Idle` when aggressive. +/// Best-effort; failure just means no throttle. 
+#[cfg(target_os = "windows")] +pub async fn set_priority_windows(pid: u32, mode: &str) { + let class = if mode == "aggressive" { + "Idle" + } else { + "BelowNormal" + }; + let script = format!("(Get-Process -Id {pid}).PriorityClass='{class}'"); + let _ = quiet_tokio_command("powershell") + .args(["-NoProfile", "-NonInteractive", "-Command", &script]) + .status() + .await; +} diff --git a/src-tauri/src/resolver.rs b/src-tauri/src/resolver.rs index 95fbb7e..84d53f4 100644 --- a/src-tauri/src/resolver.rs +++ b/src-tauri/src/resolver.rs @@ -785,6 +785,7 @@ pub fn default_config_value() -> Value { "model_cleanup_days": 1, "ollama_keep_alive": "30m", "ollama_throttle": "io", + "warm_on_startup": true, "kept_models": [], "mode_overrides": {}, "tracked_modes": ["text"], diff --git a/src/config.ts b/src/config.ts index ade2a2f..43235c6 100644 --- a/src/config.ts +++ b/src/config.ts @@ -113,6 +113,7 @@ const DEFAULT_CONFIG: Config = { model_cleanup_days: 1, ollama_keep_alive: "30m", ollama_throttle: "io", + warm_on_startup: true, cleanup_warning_suppressed_families: [], kept_models: [], mode_overrides: {}, diff --git a/src/types.ts b/src/types.ts index 7f064ca..ed92052 100644 --- a/src/types.ts +++ b/src/types.ts @@ -504,6 +504,11 @@ export interface Config { * desktop, slower inference). Only applies when MyOwnLLM spawns Ollama * itself — not when it's a system/tray service. */ ollama_throttle: "off" | "io" | "aggressive"; + /** Warm (preload) the active chat model in the background at startup so + * the first message doesn't pay the cold-load wait. On by default; the + * load runs under the configured throttle so it doesn't lock up the + * machine. Can be turned off in Settings → Performance. */ + warm_on_startup: boolean; /** Family names for which the user has dismissed the * "switching with auto-cleanup on" confirmation in the family * detail view's per-tier picker. 
Per-family rather than per-tier diff --git a/src/ui/App.svelte b/src/ui/App.svelte index 17871c0..c83b85a 100644 --- a/src/ui/App.svelte +++ b/src/ui/App.svelte @@ -277,13 +277,19 @@ view = "chat"; invoke("ollama_ensure_running").catch(() => {}); - // Proactively warm the chat model in the background so its - // one-time cold load happens now — with the throttled server - // keeping the machine responsive — rather than freezing the user - // on their first message. Skipped when the model isn't on disk - // yet (the download overlay owns that flow) or when keep_alive is - // "0" (warming would just load-then-unload). Fire-and-forget. - if (pendingTextModel && !textModelMissing && config.ollama_keep_alive !== "0") { + // Warm the chat model in the background so the first message doesn't + // pay the cold-load wait. On by default; the load runs under the + // configured throttle (Settings → Performance) so it doesn't lock up + // the machine. Skipped when the user turned it off, when the model + // isn't on disk yet (the download overlay owns that), or when + // keep_alive is "0" (warming would just load-then-unload). + // Fire-and-forget. + if ( + config.warm_on_startup !== false && + pendingTextModel && + !textModelMissing && + config.ollama_keep_alive !== "0" + ) { invoke("ollama_warm", { model: pendingTextModel }).catch(() => {}); } diff --git a/src/ui/settings/PerformanceSection.svelte b/src/ui/settings/PerformanceSection.svelte index 95a4d94..54d939c 100644 --- a/src/ui/settings/PerformanceSection.svelte +++ b/src/ui/settings/PerformanceSection.svelte @@ -26,25 +26,30 @@ { value: "off", label: "Off (fastest load)", - hint: "No throttle. Loads fastest, but a big model can briefly bog down the whole machine.", + hint: "No throttle. 
Loads fastest, but a big model can saturate the CPU and briefly freeze the machine.", }, { value: "io", - label: "Balanced — disk only (recommended)", - hint: "Eases disk priority during the load so the machine stays responsive, while inference keeps running at full speed.", + label: "Balanced (recommended)", + hint: "Lowers the model's priority a notch so the system — display, networking — keeps enough CPU to stay responsive during a load, while inference still gets the bulk of the cores.", }, { value: "aggressive", label: "Aggressive (most responsive)", - hint: "Also lowers CPU priority. Keeps the desktop snappiest during a load, but token generation runs slower.", + hint: "Deeply deprioritizes the model. Keeps the desktop snappiest during a load, but token generation runs noticeably slower.", }, ]; + /** Preload the chat model at startup so the first message is instant. + * On by default; the load runs under the throttle above. */ + let warmOnStartup = $state(true); + onMount(async () => { try { const config = await loadConfig(); keepAlive = config.ollama_keep_alive ?? "30m"; throttle = (config.ollama_throttle ?? "io") as Throttle; + warmOnStartup = config.warm_on_startup ?? true; } catch (e) { error = String(e); } finally { @@ -52,6 +57,11 @@ } }); + async function patchWarmOnStartup(value: boolean) { + warmOnStartup = value; + await updateConfig({ warm_on_startup: value }); + } + async function patchKeepAlive(value: string) { keepAlive = value; await updateConfig({ ollama_keep_alive: value }); @@ -144,6 +154,23 @@ {/if} +
+
Warm at startup
+

+ Preload the chat model in the background when the app starts, so + your first message doesn't wait for it to load. The load runs + under the throttle above, so it won't lock up the machine. +

+ +
+

Throttling only applies when MyOwnLLM starts the Ollama server itself. If Ollama is already running as a system or tray service, @@ -204,4 +231,14 @@ select:focus { outline: none; border-color: #6e6ef7; } .footnote { font-size: .72rem; color: #555; line-height: 1.5; padding: .35rem .15rem 0; margin: 0; } + + .toggle { + display: inline-flex; + align-items: center; + gap: .45rem; + font-size: .82rem; + color: #ccc; + cursor: pointer; + } + .toggle input { accent-color: #6e6ef7; } From 67562ee7f4941f8681396419ae6d39b6d663e926 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 06:35:39 +0000 Subject: [PATCH 10/12] Make the cold-start indicator inline, not a modal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the floating load dialog with an in-bubble indicator that takes the place of the typing dots while the model loads — no jolting overlay. Minimal prose: a reassurance word that rotates every 3s with a moving shine (recreated per change so it fades in), plus a quiet live CPU/RAM line as proof the machine is still working. The composer's Stop button already covers cancel, so the modal's Cancel/heading/spinner are gone. --- src/ui/Chat.svelte | 280 ++++++++++++++------------------------------- 1 file changed, 84 insertions(+), 196 deletions(-) diff --git a/src/ui/Chat.svelte b/src/ui/Chat.svelte index e905a4f..7a1558c 100644 --- a/src/ui/Chat.svelte +++ b/src/ui/Chat.svelte @@ -134,6 +134,32 @@ let modelLoading = $state(false); let modelLoadTimer: ReturnType | null = null; + /** Reassurance words for the cold-start indicator — cycled every few + * seconds (with a shine) so the user can see something is alive while + * the model loads. Short and low-key; the rotation itself is the + * "still working, not frozen" signal. 
*/ + const LOADING_WORDS = [ + "Working on it…", + "Loading the model…", + "Warming up…", + "Reading the weights…", + "Getting set up…", + "Hang tight…", + "Almost there…", + ]; + const LOADING_WORD_MS = 3000; + let loadingWordIdx = $state(0); + // Rotate the reassurance word while a load is in progress. The $effect's + // cleanup clears the interval the moment `modelLoading` goes false. + $effect(() => { + if (!modelLoading) return; + loadingWordIdx = 0; + const id = setInterval(() => { + loadingWordIdx = (loadingWordIdx + 1) % LOADING_WORDS.length; + }, LOADING_WORD_MS); + return () => clearInterval(id); + }); + /** Live CPU/RAM/GPU snapshot shown inside the load-wait dialog so * the user can see *why* it's slow (e.g. RAM near full → the model * is paging in from disk). Reuses the same `usage_live_snapshot` @@ -205,11 +231,6 @@ if (bytes == null) return "—"; return `${(bytes / 1024 ** 3).toFixed(1)} GB`; } - /** RAM-in-use percentage for the mini bar, or null when unknown. */ - function ramUsedPct(): number | null { - if (!liveStats?.ram_used_bytes || !liveStats?.ram_total_bytes) return null; - return (liveStats.ram_used_bytes / liveStats.ram_total_bytes) * 100; - } /** One pending attachment staged for the next send. Images become * Ollama-style `images: [base64]` array entries on the user @@ -1383,7 +1404,30 @@ {/each} {#if streaming && (messages.length === 0 || messages[messages.length - 1].role !== "assistant")}

-
+
+ {#if modelLoading} + +
+ {#key loadingWordIdx} + {LOADING_WORDS[loadingWordIdx]} + {/key} + {#if !routeViaDevicePubkey && liveStats} + + {#if liveStats.cpu_total_pct != null}CPU {Math.round(liveStats.cpu_total_pct)}%{/if} + {#if liveStats.ram_used_bytes != null && liveStats.ram_total_bytes != null} + · RAM {fmtGb(liveStats.ram_used_bytes)}/{fmtGb(liveStats.ram_total_bytes)} + {/if} + + {/if} +
+ {:else} + + {/if} +
{/if} @@ -1517,74 +1561,6 @@ {/if} - {#if modelLoading} - - - {/if} - {#if settingsTab} Date: Fri, 29 May 2026 06:45:58 +0000 Subject: [PATCH 11/12] Hold a loading screen during startup warm; share the indicator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the cold-start indicator (rotating shining word + live CPU/RAM) into a reusable LoadingPulse component and use it in two places: - In chat: still shown in place of the typing dots whenever a call is slow (cold load or a long-running turn) — unchanged behavior, now via the shared component. - At startup: when warm-on-startup runs, hold a full-screen loading screen (spinner + LoadingPulse beneath it) over the chat until the model is resident, instead of dropping into a chat that feels sluggish while it competes with the cold load. The chat still mounts behind the screen, so it's ready the moment the screen lifts; a Continue button is the escape hatch. LoadingPulse self-manages its word rotation and usage poll (mount/ unmount lifecycle), so Chat no longer hand-rolls those. 
--- src/ui/App.svelte | 77 +++++++++++++++-- src/ui/Chat.svelte | 172 +++++-------------------------------- src/ui/LoadingPulse.svelte | 113 ++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 158 deletions(-) create mode 100644 src/ui/LoadingPulse.svelte diff --git a/src/ui/App.svelte b/src/ui/App.svelte index c83b85a..75f6a26 100644 --- a/src/ui/App.svelte +++ b/src/ui/App.svelte @@ -7,6 +7,7 @@ import Chat from "./Chat.svelte"; import TranscribeView from "./TranscribeView.svelte"; import Sidebar from "./Sidebar.svelte"; + import LoadingPulse from "./LoadingPulse.svelte"; import PermissionPromptModal from "./PermissionPromptModal.svelte"; import { loadConfig, updateConfig } from "../config"; import { getActiveManifest } from "../providers"; @@ -83,6 +84,12 @@ type View = "loading" | "chat"; let view = $state("loading"); + /** True while the startup warm is in flight: we keep a full-screen + * loading screen up (rather than dropping into a chat that feels + * sluggish because it's competing with the cold load) until the model + * is resident. The chat still mounts behind it, so it's fully ready + * when the screen lifts. A "Continue" button is the escape hatch. */ + let warming = $state(false); let appVersion = $state(""); let hardware = $state(null); let activeModel = $state(""); @@ -277,20 +284,28 @@ view = "chat"; invoke("ollama_ensure_running").catch(() => {}); - // Warm the chat model in the background so the first message doesn't - // pay the cold-load wait. On by default; the load runs under the - // configured throttle (Settings → Performance) so it doesn't lock up - // the machine. Skipped when the user turned it off, when the model - // isn't on disk yet (the download overlay owns that), or when - // keep_alive is "0" (warming would just load-then-unload). - // Fire-and-forget. + // Warm the chat model so the first message doesn't pay the cold-load + // wait. 
On by default; the load runs under the configured throttle + // (Settings → Performance). Skipped when the user turned it off, when + // the model isn't on disk yet (the download overlay owns that), or + // when keep_alive is "0" (warming would just load-then-unload). + // + // We hold a full-screen loading screen (`warming`) over the chat — + // which keeps mounting/initializing behind it — until the warm + // settles, so the user lands on a chat that's actually ready instead + // of one that feels sluggish while it competes with the cold load. if ( config.warm_on_startup !== false && pendingTextModel && !textModelMissing && config.ollama_keep_alive !== "0" ) { - invoke("ollama_warm", { model: pendingTextModel }).catch(() => {}); + warming = true; + invoke("ollama_warm", { model: pendingTextModel }) + .catch(() => {}) + .finally(() => { + warming = false; + }); } kickUpdateCheck(); @@ -1160,6 +1175,23 @@ the same modal; the modal self-hides when the prompt queue drains. --> + + {#if warming} + +
+
+ + + {#if appVersion} +

v{appVersion}

+ {/if} +
+ {/if} From 2fe04a39275e0980c3541406fb2a02ba08770c0f Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 29 May 2026 06:49:16 +0000 Subject: [PATCH 12/12] Make loading indicator phrases ambiguous The indicator covers both a cold model load and a slow in-progress turn, so model-specific phrases (Loading the model / Reading the weights / Warming up) wrongly implied a reload mid-chat. Swap for neutral 'work is underway' phrases that fit either case. --- src/ui/LoadingPulse.svelte | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/ui/LoadingPulse.svelte b/src/ui/LoadingPulse.svelte index fdff5f2..cbbf92d 100644 --- a/src/ui/LoadingPulse.svelte +++ b/src/ui/LoadingPulse.svelte @@ -10,13 +10,17 @@ let { showStats = true }: { showStats?: boolean } = $props(); + // Deliberately ambiguous about *what's* happening: this same indicator + // covers both a cold model load and a slow in-progress turn, so phrases + // like "Loading the model…" would wrongly suggest a reload mid-chat. + // These just reassure that work is underway, whatever the cause. const WORDS = [ "Working on it…", - "Loading the model…", - "Warming up…", - "Reading the weights…", - "Getting set up…", + "Thinking it through…", + "Crunching…", "Hang tight…", + "Still working…", + "Just a moment…", "Almost there…", ]; const WORD_MS = 3000;