diff --git a/src-tauri/src/asr/moonshine.rs b/src-tauri/src/asr/moonshine.rs index df1e59b..201c159 100644 --- a/src-tauri/src/asr/moonshine.rs +++ b/src-tauri/src/asr/moonshine.rs @@ -673,7 +673,7 @@ impl MoonshineBackend { let arr = if use_cache { kv.values[idx] .as_ref() - .ok_or_else(|| anyhow!("KV cache slot {} unpopulated under use_cache", idx))? + .ok_or_else(|| anyhow!("KV cache slot {idx} unpopulated under use_cache"))? .clone() } else { let resolved_shape: Vec = past @@ -736,7 +736,7 @@ impl MoonshineBackend { .map_err(|e| anyhow!("ort extract logits: {e}"))?; let shape = logits_view.shape().to_vec(); if shape.len() != 3 || shape[0] != 1 { - return Err(anyhow!("unexpected decoder logits shape {:?}", shape)); + return Err(anyhow!("unexpected decoder logits shape {shape:?}")); } let last = shape[1] - 1; let vocab = shape[2]; diff --git a/src-tauri/src/asr/parakeet.rs b/src-tauri/src/asr/parakeet.rs index d810e05..db156f6 100644 --- a/src-tauri/src/asr/parakeet.rs +++ b/src-tauri/src/asr/parakeet.rs @@ -127,7 +127,7 @@ impl AsrBackend for ParakeetBackend { .map_err(|e| anyhow!("ort threads: {e}"))? .commit_from_file(&model_path_owned) .map_err(|e| anyhow!("loading {}: {e}", model_path_owned.display())) - .with_context(|| format!("warm_up parakeet {}", model_name_owned)) + .with_context(|| format!("warm_up parakeet {model_name_owned}")) })?; // Sniff I/O names. NeMo's istupakov export uses diff --git a/src-tauri/src/cli.rs b/src-tauri/src/cli.rs index fd69db0..a13bd85 100644 --- a/src-tauri/src/cli.rs +++ b/src-tauri/src/cli.rs @@ -1052,13 +1052,10 @@ async fn cmd_fetch_onnxruntime() -> Result<()> { let pct = (bytes as f64 / total as f64 * 100.0) as u64; let _ = write!( std::io::stderr(), - "\r {:>3}% {:>10} / {:>10} bytes", - pct, - bytes, - total + "\r {pct:>3}% {bytes:>10} / {total:>10} bytes" ); } else { - let _ = write!(std::io::stderr(), "\r {:>10} bytes", bytes); + let _ = write!(std::io::stderr(), "\r {bytes:>10} bytes"); } let _ = std::io::stderr().flush(); }); diff --git a/src-tauri/src/diarize/embedder.rs b/src-tauri/src/diarize/embedder.rs index 88d7373..2e9b473 100644 --- a/src-tauri/src/diarize/embedder.rs +++ b/src-tauri/src/diarize/embedder.rs @@ -234,10 +234,7 @@ impl Embedder { // is correct for both shapes since the leading axes are 1. let shape = view.shape().to_vec(); if shape.last().copied().unwrap_or(0) == 0 { - return Err(anyhow!( - "embedder produced zero-length output ({:?})", - shape - )); + return Err(anyhow!("embedder produced zero-length output ({shape:?})")); } let mut out: Vec = view.iter().copied().collect(); l2_normalize(&mut out); diff --git a/src-tauri/src/diarize/segmenter.rs b/src-tauri/src/diarize/segmenter.rs index 27a09f0..4c323da 100644 --- a/src-tauri/src/diarize/segmenter.rs +++ b/src-tauri/src/diarize/segmenter.rs @@ -227,8 +227,7 @@ impl Segmenter { let shape = logits.shape().to_vec(); if shape.len() != 3 || shape[0] != 1 || shape[2] != 7 { return Err(anyhow!( - "unexpected segmenter output shape {:?} (want [1, T, 7])", - shape + "unexpected segmenter output shape {shape:?} (want [1, T, 7])" )); } let t_frames = shape[1]; diff --git a/src-tauri/src/hardware.rs b/src-tauri/src/hardware.rs index 90ff2ad..e471262 100644 --- a/src-tauri/src/hardware.rs +++ b/src-tauri/src/hardware.rs @@ -177,6 +177,7 @@ fn read_proc_meminfo_total_gb() -> Option { } /// Pulled out for testability. Reads the `MemTotal: NNN kB` line. +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] // Linux detect() + tests only fn parse_meminfo_total_kb(content: &str) -> Option { for line in content.lines() { if let Some(rest) = line.strip_prefix("MemTotal:") { @@ -262,6 +263,7 @@ fn df_k_gb(path: &str) -> Option { /// rows; field 4 is `Available` in 1K blocks. Some `df` flavours wrap long /// device names onto a second line, so the available column may be on the /// row after the device name. Find the first row with a numeric column 4. +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] // Linux detect() + tests only fn parse_df_avail_kb(out: &str) -> Option { for line in out.lines().skip(1) { let parts: Vec<&str> = line.split_whitespace().collect(); @@ -303,6 +305,7 @@ fn detect_soc_label() -> Option { None } +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] // Linux detect() + tests only fn parse_device_tree_model(raw: &[u8]) -> Option { // Trim trailing NUL bytes the kernel attaches to the device-tree string. let end = raw.iter().position(|b| *b == 0).unwrap_or(raw.len()); @@ -313,6 +316,7 @@ fn parse_device_tree_model(raw: &[u8]) -> Option { Some(s.to_string()) } +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] // Linux detect() + tests only fn parse_cpuinfo_model(content: &str) -> Option { // ARM kernels emit `Model : Raspberry Pi 5 Model B Rev 1.0` and/or // `Hardware : BCM2712`. Prefer the human-friendly Model line. diff --git a/src-tauri/src/main.rs b/src-tauri/src/main.rs index 321d46e..35a04e8 100644 --- a/src-tauri/src/main.rs +++ b/src-tauri/src/main.rs @@ -73,6 +73,25 @@ async fn ollama_list_models() -> Result, String> { ollama::list_models().await.map_err(|e| e.to_string()) } +/// True when `model` is already resident in Ollama's memory, so the +/// next chat won't cold-load. The chat UI uses this to decide whether +/// to paint the load dialog *before* firing the request (a cold load +/// can thrash the machine hard enough that a delayed dialog never +/// renders). Best-effort: returns false if Ollama can't be reached. +#[tauri::command] +async fn ollama_model_loaded(model: String) -> bool { + ollama::is_model_loaded(&model).await +} + +/// Proactively load `model` into memory at the throttled server's low +/// priority, so the one-time cold load happens at a predictable moment +/// (e.g. just after launch) instead of freezing the user mid-chat. +#[tauri::command] +async fn ollama_warm(model: String) -> Result<(), String> { + ollama::ensure_running().await.map_err(|e| e.to_string())?; + ollama::warm(&model).await.map_err(|e| e.to_string()) +} + #[tauri::command] async fn ollama_delete_model(name: String) -> Result<(), String> { ollama::delete_model(&name).await.map_err(|e| e.to_string()) @@ -632,7 +651,7 @@ fn mesh_file_save_at(path: String, bytes_b64: String) -> Result<(), String> { } let target = std::path::PathBuf::from(&path); if target.is_dir() { - return Err(format!("target {} is a directory", path)); + return Err(format!("target {path} is a directory")); } // Best-effort: make sure the parent directory exists. The // save-dialog typically lands the user inside an existing folder, @@ -1005,6 +1024,8 @@ fn main() { ollama_install, ollama_stop, ollama_list_models, + ollama_model_loaded, + ollama_warm, ollama_delete_model, preload_modes, ensure_tracked_models, diff --git a/src-tauri/src/mesh/daemon.rs b/src-tauri/src/mesh/daemon.rs index 033c893..53fc88a 100644 --- a/src-tauri/src/mesh/daemon.rs +++ b/src-tauri/src/mesh/daemon.rs @@ -532,25 +532,28 @@ impl Drop for DaemonChild { /// dev cycle rewrites with fresh content instead of perpetually /// staging corrupt bits. Files at paths we DON'T own (PATH /// lookup, env-var override) just get skipped without deletion. -fn looks_like_executable(path: &Path) -> bool { +/// Validate a candidate daemon binary. `Ok(())` when usable, else +/// `Err(reason)` describing why it was rejected. Performs the self-heal +/// removal of a stale *owned* slot, but does NOT log — callers collect +/// the reasons and surface them only when the whole search fails, so a +/// successful daemon launch stays quiet (see `ensure_daemon_running`). +fn check_executable(path: &Path) -> Result<(), String> { match validate_path_is_executable(path) { - Ok(()) => true, + Ok(()) => Ok(()), Err(reason) => { - // Self-heal: if this is a known-owned slot and the - // content is invalid, delete it. Tauri's externalBin - // staging will regenerate from the source on the - // next build; the source's own validator catches - // problems before propagating them. + // Self-heal: if this is a known-owned slot and the content is + // invalid, delete it. Tauri's externalBin staging regenerates + // it from the source on the next build; the source's own + // validator catches problems before propagating them. if is_owned_slot(path) { - eprintln!( - "daemon: {} failed executable check ({reason}); removing stale file", - path.display() - ); let _ = std::fs::remove_file(path); + Err(format!( + "{} failed executable check ({reason}); removed stale file", + path.display() + )) } else { - eprintln!("daemon: skipping {} ({reason})", path.display()); + Err(format!("skipping {} ({reason})", path.display())) } - false } } } @@ -576,7 +579,7 @@ fn validate_path_is_executable(path: &Path) -> Result<(), String> { 0xFEED_FACE | 0xFEED_FACF | 0xCAFE_BABE | 0xBEBA_FECA ); if !(pe || elf || macho) { - return Err(format!("bad magic {:02x?}", head)); + return Err(format!("bad magic {head:02x?}")); } if pe { f.seek(SeekFrom::Start(0x3C)) @@ -623,6 +626,14 @@ fn is_owned_slot(path: &Path) -> bool { } pub fn daemon_binary_candidates() -> Vec { + daemon_binary_candidates_diag().0 +} + +/// Like [`daemon_binary_candidates`] but also returns the human-readable +/// reason each rejected path was skipped. `ensure_daemon_running` holds +/// these and only prints them if the whole search fails, so a successful +/// launch doesn't spam the log with every probed-and-skipped location. +fn daemon_binary_candidates_diag() -> (Vec, Vec) { let exe = if cfg!(windows) { "myownmesh.exe" } else { @@ -633,23 +644,25 @@ pub fn daemon_binary_candidates() -> Vec { // sidecars next to the dev exe; `tauri build` strips it. // Checking both covers dev + production from one runtime path. let exe_with_triple = if cfg!(windows) { - format!("myownmesh-{}.exe", DAEMON_SIDECAR_TRIPLE) + format!("myownmesh-{DAEMON_SIDECAR_TRIPLE}.exe") } else { - format!("myownmesh-{}", DAEMON_SIDECAR_TRIPLE) + format!("myownmesh-{DAEMON_SIDECAR_TRIPLE}") }; let mut out: Vec = Vec::new(); + let mut diags: Vec = Vec::new(); // Helper: push a candidate iff it exists AND looks like a // real executable (filters out the zero-byte stub // `build.rs` writes when the daemon fetch was skipped, AND // filters out corrupt / truncated downloads that would // otherwise produce a confusing "%1 is not a valid Win32 - // application" error when we try to spawn them). - fn push_if_usable(out: &mut Vec, p: PathBuf) { - if !looks_like_executable(&p) { - return; + // application" error when we try to spawn them). Rejection + // reasons are collected into `diags` rather than logged here. + fn push_if_usable(out: &mut Vec, diags: &mut Vec, p: PathBuf) { + match check_executable(&p) { + Ok(()) => out.push(p), + Err(reason) => diags.push(reason), } - out.push(p); } // 1. Bundled sidecar next to the running LLM executable — @@ -659,8 +672,8 @@ pub fn daemon_binary_candidates() -> Vec { // via the same code path. if let Ok(exe_path) = std::env::current_exe() { if let Some(exe_dir) = exe_path.parent() { - push_if_usable(&mut out, exe_dir.join(exe)); - push_if_usable(&mut out, exe_dir.join(&exe_with_triple)); + push_if_usable(&mut out, &mut diags, exe_dir.join(exe)); + push_if_usable(&mut out, &mut diags, exe_dir.join(&exe_with_triple)); } } @@ -670,7 +683,11 @@ pub fn daemon_binary_candidates() -> Vec { // the *only* place the binary lives. Relative to the // crate, so it works from any working directory. let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - push_if_usable(&mut out, manifest.join("binaries").join(&exe_with_triple)); + push_if_usable( + &mut out, + &mut diags, + manifest.join("binaries").join(&exe_with_triple), + ); // 2 + 3. Explicit env-var overrides. for var in ["MYOWNLLM_MESH_BIN", "MYOWNMESH_BIN"] { @@ -696,12 +713,21 @@ pub fn daemon_binary_candidates() -> Vec { // `manifest` already declared above for the `binaries/` // lookup; reuse it here. for profile in ["debug", "release"] { - push_if_usable(&mut out, manifest.join("target").join(profile).join(exe)); + push_if_usable( + &mut out, + &mut diags, + manifest.join("target").join(profile).join(exe), + ); if let Some(parent) = manifest.parent() { - push_if_usable(&mut out, parent.join("target").join(profile).join(exe)); + push_if_usable( + &mut out, + &mut diags, + parent.join("target").join(profile).join(exe), + ); if let Some(grandparent) = parent.parent() { push_if_usable( &mut out, + &mut diags, grandparent .join("MyOwnMesh") .join("target") @@ -719,7 +745,7 @@ pub fn daemon_binary_candidates() -> Vec { let canonical = std::fs::canonicalize(p).unwrap_or_else(|_| p.clone()); seen.insert(canonical) }); - out + (out, diags) } /// Legacy single-binary lookup. Returns the highest-priority @@ -786,8 +812,15 @@ pub async fn ensure_daemon_running() -> Result<(ControlClient, Option Result<(ControlClient, Option = candidates.iter().map(|p| p.display().to_string()).collect(); Err(anyhow!( "no working `myownmesh` binary on this machine. Tried:\n {}\nLast error: {}", diff --git a/src-tauri/src/mesh/identity.rs b/src-tauri/src/mesh/identity.rs index 7dea8e7..31dc1e7 100644 --- a/src-tauri/src/mesh/identity.rs +++ b/src-tauri/src/mesh/identity.rs @@ -17,5 +17,5 @@ //! `myownmesh-core` and aren't duplicated here. pub use myownmesh_core::identity::{ - generate_network_id, load_or_create, normalize_network_id, set_label, Identity, + generate_network_id, load_or_create, normalize_network_id, set_label, }; diff --git a/src-tauri/src/mesh/roster.rs b/src-tauri/src/mesh/roster.rs index 7f29ba8..44a60be 100644 --- a/src-tauri/src/mesh/roster.rs +++ b/src-tauri/src/mesh/roster.rs @@ -23,8 +23,7 @@ //! over. pub use myownmesh_core::roster::{ - add_peer, add_peer_in, delete, empty_for, is_authorized, load, remove_peer, remove_peer_in, - save, AuthorizedPeer, Roster, ROSTER_VERSION, + add_peer, delete, load, remove_peer, save, AuthorizedPeer, Roster, ROSTER_VERSION, }; /// One-shot migration from the pre-multi-network single roster file diff --git a/src-tauri/src/mesh/signing.rs b/src-tauri/src/mesh/signing.rs index 36066ec..6aaa009 100644 --- a/src-tauri/src/mesh/signing.rs +++ b/src-tauri/src/mesh/signing.rs @@ -8,4 +8,4 @@ //! which reads `MYOWNMESH_HOME` — set to `~/.myownllm` in `main.rs` //! — so the local Device ID is unchanged. -pub use myownmesh_core::signing::{pubkey_part, sign, verify}; +pub use myownmesh_core::signing::{sign, verify}; diff --git a/src-tauri/src/ollama.rs b/src-tauri/src/ollama.rs index 1157782..c89bf65 100644 --- a/src-tauri/src/ollama.rs +++ b/src-tauri/src/ollama.rs @@ -96,6 +96,10 @@ fn ensure_macos_default_on_path() -> bool { false } +// The trailing `Ok(())` is the Linux fall-through / unsupported-platform +// fallback; on macOS and Windows the cfg blocks above always return, so +// it's unreachable there by design — silence the per-platform lint. +#[allow(unreachable_code)] pub async fn install() -> Result<()> { #[cfg(target_os = "linux")] { @@ -274,33 +278,108 @@ pub async fn ensure_running() -> Result<()> { return Ok(()); } - // OLLAMA_ORIGINS=* belt-and-suspenders: when WE spawn the server (e.g. Linux - // or a fresh standalone Windows install), this lets the GUI fetch directly - // from `http://127.0.0.1:11434` without Ollama's CORS allowlist rejecting - // the WebView's `Origin` (which on Tauri 2 / Windows is `http://tauri.localhost`, - // not in Ollama's defaults). When the Windows installer runs Ollama as a - // tray service we can't influence its env — that's why the GUI also routes - // chat through myownllm's API server (see Chat.svelte). - let child = quiet_tokio_command("ollama") - .arg("serve") - .env("OLLAMA_ORIGINS", "*") - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .spawn() - .context("failed to spawn ollama serve")?; + // Throttle the server at launch (Settings → Performance). Applying it + // as an argv prefix is the reliable lever — notably on macOS, where + // taskpolicy's IO policy only takes effect when launching a program, + // not via `-p` on a running PID (that gap left the throttle a no-op + // and let a load thrash the machine). "off" yields no prefix. + let mode = throttle_mode(); + let prefix = crate::process::throttle_launch_prefix(&mode); + // Spawn under the wrapper when we have one. If the wrapper binary is + // missing the spawn itself errors — fall back to a plain spawn so a + // missing throttle tool can never leave the app without an LLM. + let child = match spawn_ollama_serve(prefix.as_deref()) { + Ok(c) => c, + Err(_) if prefix.is_some() => { + spawn_ollama_serve(None).context("failed to spawn ollama serve")? + } + Err(e) => return Err(anyhow::Error::new(e).context("failed to spawn ollama serve")), + }; *guard = Some(child); - // Wait up to 10 seconds for API to become reachable. + // Windows throttles post-spawn (no reliable launch wrapper there); + // Unix already throttled via the argv prefix above. + #[cfg(target_os = "windows")] + if mode != "off" { + if let Some(pid) = guard.as_ref().and_then(|c| c.id()) { + crate::process::set_priority_windows(pid, &mode).await; + } + } + + // Wait up to 10s for the API. Bail early if a launch-wrapper child + // exited without exec'ing ollama (e.g. an unsupported flag on this OS + // version): try_wait → Some means the wrapper failed, so we stop + // waiting and retry directly below rather than burning the full 10s. + let mut up = false; for _ in 0..20 { tokio::time::sleep(std::time::Duration::from_millis(500)).await; if api_reachable().await { - return Ok(()); + up = true; + break; + } + if prefix.is_some() { + if let Some(c) = guard.as_mut() { + if matches!(c.try_wait(), Ok(Some(_))) { + break; // wrapper died without bringing ollama up + } + } + } + } + if up { + return Ok(()); + } + + // Wrapped launch never came up — retry with a direct spawn so a + // broken/incompatible throttle tool can't disable the LLM entirely. + if prefix.is_some() { + if let Some(mut dead) = guard.take() { + let _ = dead.kill().await; + } + let direct = spawn_ollama_serve(None).context("failed to spawn ollama serve")?; + *guard = Some(direct); + for _ in 0..20 { + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + if api_reachable().await { + return Ok(()); + } } } Err(anyhow!("ollama serve did not become reachable within 10s")) } +/// Spawn `ollama serve` with our standard env, optionally under a +/// launch-time throttle wrapper (`prefix`, e.g. `["taskpolicy", "-d", +/// "throttle"]`). The wrapper tools exec their target, so the child PID +/// is ollama itself — kill-on-exit (`stop`) is unaffected. +/// +/// OLLAMA_ORIGINS=* is belt-and-suspenders: when WE spawn the server this +/// lets the GUI fetch directly from `http://127.0.0.1:11434` without +/// Ollama's CORS allowlist rejecting the WebView's `Origin`. The memory +/// caps keep at most one model resident and serve one request at a time, +/// so Ollama never tries to hold two models (or N parallel KV caches) in +/// RAM/VRAM at once — the swap thrash behind the hardest freezes. +fn spawn_ollama_serve(prefix: Option<&[&str]>) -> std::io::Result { + let mut cmd = match prefix { + Some(p) if !p.is_empty() => { + let mut c = quiet_tokio_command(p[0]); + c.args(&p[1..]).arg("ollama").arg("serve"); + c + } + _ => { + let mut c = quiet_tokio_command("ollama"); + c.arg("serve"); + c + } + }; + cmd.env("OLLAMA_ORIGINS", "*") + .env("OLLAMA_MAX_LOADED_MODELS", "1") + .env("OLLAMA_NUM_PARALLEL", "1") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() +} + async fn api_reachable() -> bool { reqwest_get("http://127.0.0.1:11434/").await.is_ok() } @@ -595,14 +674,41 @@ pub async fn has_model(model: &str) -> Result { Ok(out.success()) } +/// True when `model` is currently loaded in Ollama's memory — i.e. the +/// next chat won't pay a cold load. Queried via `/api/ps`. On any error +/// (Ollama down, curl missing, unparseable body) we return `false` so +/// callers fall back to showing the load dialog rather than wrongly +/// skipping it. Ollama reports loaded models under either `name` or +/// `model`, so we match on both. +pub async fn is_model_loaded(model: &str) -> bool { + let Ok(body) = reqwest_get("http://127.0.0.1:11434/api/ps").await else { + return false; + }; + let Ok(v) = serde_json::from_str::(&body) else { + return false; + }; + v.get("models") + .and_then(|m| m.as_array()) + .map(|arr| { + arr.iter().any(|e| { + e.get("name").and_then(|n| n.as_str()) == Some(model) + || e.get("model").and_then(|n| n.as_str()) == Some(model) + }) + }) + .unwrap_or(false) +} + /// Fire a 1-token chat call so Ollama mmaps the weights and keeps the model loaded -/// for `keep_alive`. Used by `myownllm preload --warm`. +/// for `keep_alive`. Used by `myownllm preload --warm` and the startup warm. +/// Honors the user's configured `keep_alive` so a proactive warm respects +/// the same residency window as real chats (warming with "0" would just +/// load-then-unload, so callers skip warming in that case). pub async fn warm(model: &str) -> Result<()> { let body = serde_json::json!({ "model": model, "messages": [{"role": "user", "content": "ok"}], "stream": false, - "keep_alive": "10m", + "keep_alive": chat_keep_alive(), "options": { "num_predict": 1 } }) .to_string(); @@ -730,6 +836,44 @@ pub enum ChatStreamOutcome { Cancelled, } +/// Resolve the user's configured Ollama `keep_alive` for chat +/// requests. This controls how long Ollama keeps the model resident +/// in RAM/VRAM after a turn finishes: longer values avoid cold-start +/// reloads between messages (the common "why is it slow again?" +/// complaint), shorter values free memory sooner so the LLM can +/// coexist with transcription on a memory-tight machine. Accepts +/// Ollama's native duration format — "30m", "1h", "0" (unload +/// immediately), "-1" (keep until evicted). Falls back to "30m" when +/// the config is unreadable or the key is absent (older configs). +fn chat_keep_alive() -> serde_json::Value { + crate::resolver::load_config_value() + .ok() + .and_then(|c| { + c.get("ollama_keep_alive") + .and_then(|v| v.as_str()) + .map(str::to_string) + }) + .map(serde_json::Value::from) + .unwrap_or_else(|| serde_json::json!("30m")) +} + +/// Resolve the user's configured throttle mode for the Ollama server we +/// spawn — how hard we ease its priority so model loading doesn't starve +/// the desktop: "off" (no throttle), "io" (disk-IO only; keeps inference +/// full speed — the default), or "aggressive" (also demote CPU/QoS; most +/// responsive machine but slower inference). Falls back to "io". +fn throttle_mode() -> String { + crate::resolver::load_config_value() + .ok() + .and_then(|c| { + c.get("ollama_throttle") + .and_then(|v| v.as_str()) + .map(str::to_string) + }) + .filter(|m| matches!(m.as_str(), "off" | "io" | "aggressive")) + .unwrap_or_else(|| "io".to_string()) +} + /// Streamed chat completion. Invokes `on_content` for each visible token /// chunk, `on_thinking` for any reasoning/thinking deltas (thinking models /// emit those in `message.thinking`; non-thinking models never call it), @@ -771,6 +915,7 @@ where "model": model, "messages": messages, "stream": true, + "keep_alive": chat_keep_alive(), }); if let Some(t) = think { body["think"] = serde_json::json!(t); @@ -933,6 +1078,7 @@ pub async fn chat_once( "model": model, "messages": messages, "stream": false, + "keep_alive": chat_keep_alive(), }); if let Some(opts) = options { body["options"] = opts; diff --git a/src-tauri/src/process.rs b/src-tauri/src/process.rs index 9d1eeab..7e84356 100644 --- a/src-tauri/src/process.rs +++ b/src-tauri/src/process.rs @@ -41,8 +41,93 @@ fn apply_quiet_flags(_cmd: &mut std::process::Command) {} #[cfg(target_os = "windows")] fn apply_quiet_flags_tokio(cmd: &mut tokio::process::Command) { - use std::os::windows::process::CommandExt; + // tokio's Command exposes `creation_flags` inherently — no CommandExt + // trait import needed (unlike the std::process variant above). cmd.creation_flags(CREATE_NO_WINDOW); } #[cfg(not(target_os = "windows"))] fn apply_quiet_flags_tokio(_cmd: &mut tokio::process::Command) {} + +/// Launch-time throttle wrapper for the LLM server, per the user's `mode` +/// ("io" | "aggressive"; "off" → `None`). Returned as an argv prefix to +/// prepend before `ollama serve` rather than something we apply to the +/// PID after spawn — that distinction matters on macOS, where +/// `taskpolicy`'s disk-IO policy only takes effect when *launching* a +/// program, not via `-p` on a running one. Applying it post-spawn was a +/// silent no-op, which left the server unthrottled and let a model load +/// thrash the whole machine. The wrapper tools (`ionice`/`nice`/ +/// `taskpolicy`) exec their target, so the resulting child PID is still +/// ollama and our kill-on-exit handling is unaffected. +/// +/// - `"io"` (default, "balanced"): a moderate `nice` (CPU) — plus a low +/// disk-IO class on Linux. `nice` only makes the server yield when +/// something else (the display server, networking, the WebView) wants +/// the CPU, so the machine stays responsive during a heavy load while +/// inference still gets the bulk of the cores when nothing competes. +/// Crucially it does NOT leave the CPU wide open to the server — which +/// is what starved the desktop and froze the machine — nor force it +/// onto efficiency cores like background QoS, so inference isn't +/// crippled. +/// - `"aggressive"`: deep `nice` / background QoS — most responsive +/// desktop during a load, but noticeably slower inference. +/// +/// `None` on Windows (it throttles post-spawn via [`set_priority_windows`] +/// instead) and for `"off"`. +pub fn throttle_launch_prefix(mode: &str) -> Option> { + if mode == "off" { + return None; + } + let aggressive = mode == "aggressive"; + #[cfg(target_os = "linux")] + { + Some(if aggressive { + // Max nice + idle IO class — server only runs when nothing + // else wants CPU or disk. Snappiest desktop, slowest model. + vec!["nice", "-n", "19", "ionice", "-c", "3"] + } else { + // Moderate nice so the system keeps headroom, plus low + // best-effort IO so disk reads yield under contention. The + // server still gets most of the CPU when it's the only thing + // running, so inference stays fast. + vec!["nice", "-n", "10", "ionice", "-c", "2", "-n", "7"] + }) + } + #[cfg(target_os = "macos")] + { + Some(if aggressive { + // Background QoS: efficiency cores + throttled compute & IO. + // Frees the machine most, but slows inference. + vec!["taskpolicy", "-b"] + } else { + // Moderate nice only — reserves CPU headroom for the system + // (display, networking) while leaving the server on the + // performance cores, so inference isn't kneecapped. `nice` is + // POSIX and always present, so this can't fail the launch. + vec!["nice", "-n", "10"] + }) + } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + let _ = aggressive; // consumed on platforms without a launch wrapper + None + } +} + +/// Windows-only: ease the spawned server's priority class after spawn. +/// Windows has no launch-time IO-throttle wrapper we can rely on and no +/// external per-process IO priority without FFI, so we nudge the priority +/// class instead — `BelowNormal` for the default, `Idle` when aggressive. +/// Best-effort; failure just means no throttle. +#[cfg(target_os = "windows")] +pub async fn set_priority_windows(pid: u32, mode: &str) { + let class = if mode == "aggressive" { + "Idle" + } else { + "BelowNormal" + }; + let script = format!("(Get-Process -Id {pid}).PriorityClass='{class}'"); + let _ = quiet_tokio_command("powershell") + .args(["-NoProfile", "-NonInteractive", "-Command", &script]) + .status() + .await; +} diff --git a/src-tauri/src/resolver.rs b/src-tauri/src/resolver.rs index 9a0e92e..84d53f4 100644 --- a/src-tauri/src/resolver.rs +++ b/src-tauri/src/resolver.rs @@ -783,6 +783,9 @@ pub fn default_config_value() -> Value { "active_family": "gemma4", "active_mode": "text", "model_cleanup_days": 1, + "ollama_keep_alive": "30m", + "ollama_throttle": "io", + "warm_on_startup": true, "kept_models": [], "mode_overrides": {}, "tracked_modes": ["text"], diff --git a/src-tauri/src/transcribe.rs b/src-tauri/src/transcribe.rs index 4dba775..93a8f65 100644 --- a/src-tauri/src/transcribe.rs +++ b/src-tauri/src/transcribe.rs @@ -1409,8 +1409,7 @@ fn run_session( count_pending_chunks(&buffer_dir), None, Some(format!( - "ASR inference error ({}/{}): {e:#}", - consecutive_errors, ASR_CONSECUTIVE_ERROR_LIMIT + "ASR inference error ({consecutive_errors}/{ASR_CONSECUTIVE_ERROR_LIMIT}): {e:#}" )), ), ); @@ -2128,8 +2127,7 @@ fn ingest_loop( count_pending_chunks(&buffer_dir), None, Some(format!( - "Backlog full ({:.0} s); dropping oldest chunk to stay live.", - MAX_BACKLOG_SECONDS + "Backlog full ({MAX_BACKLOG_SECONDS:.0} s); dropping oldest chunk to stay live." )), ), ); diff --git a/src-tauri/src/usage.rs b/src-tauri/src/usage.rs index 781ef9a..ba93fa5 100644 --- a/src-tauri/src/usage.rs +++ b/src-tauri/src/usage.rs @@ -281,6 +281,67 @@ fn cpu_count() -> Option { .map(|n| n.get() as u32) } +/// Sum a `ps -A -o %cpu=` dump (one float per process, each already a +/// share of a single core) and normalise by `cpus` into a 0..100 share +/// of total system CPU. `None` when nothing parses. Pure so it can be +/// unit-tested on any host even though its only caller is macOS. +#[cfg_attr(not(target_os = "macos"), allow(dead_code))] +fn parse_total_cpu_pct(ps_output: &str, cpus: f64) -> Option { + let cpus = if cpus <= 0.0 { 1.0 } else { cpus }; + let mut sum = 0.0; + let mut any = false; + for tok in ps_output.split_whitespace() { + if let Ok(v) = tok.parse::() { + sum += v; + any = true; + } + } + if !any { + return None; + } + Some((sum / cpus).clamp(0.0, 100.0)) +} + +/// Pull a page count out of a `vm_stat` line like +/// `Pages active: 123456.` — digits only, trailing '.' dropped. +#[cfg_attr(not(target_os = "macos"), allow(dead_code))] +fn parse_vm_stat_pages(vm_stat: &str, key: &str) -> Option { + for line in vm_stat.lines() { + if let Some(rest) = line.trim_start().strip_prefix(key) { + let digits: String = rest.chars().filter(|c| c.is_ascii_digit()).collect(); + if !digits.is_empty() { + return digits.parse::().ok(); + } + } + } + None +} + +/// Read the page size from a `vm_stat` header line, e.g. +/// "Mach Virtual Memory Statistics: (page size of 16384 bytes)". Using +/// the dump's own page size keeps the byte math consistent with its +/// page counts (Apple Silicon is 16 KiB, Intel 4 KiB, and `hw.pagesize` +/// doesn't always agree with the VM page size). +#[cfg_attr(not(target_os = "macos"), allow(dead_code))] +fn parse_vm_stat_page_size(vm_stat: &str) -> Option { + let line = vm_stat.lines().next()?; + let after = line.split("page size of").nth(1)?; + let tok = after.split_whitespace().next()?; + let digits: String = tok.chars().filter(|c| c.is_ascii_digit()).collect(); + digits.parse::().ok().filter(|&n| n > 0) +} + +/// macOS system "used" memory ≈ (active + wired + compressed) pages × +/// page size — the same components Activity Monitor reports as "Memory +/// Used". `None` if any component line is absent. +#[cfg_attr(not(target_os = "macos"), allow(dead_code))] +fn parse_vm_stat_used_bytes(vm_stat: &str, page_bytes: u64) -> Option { + let active = parse_vm_stat_pages(vm_stat, "Pages active:")?; + let wired = parse_vm_stat_pages(vm_stat, "Pages wired down:")?; + let compressed = parse_vm_stat_pages(vm_stat, "Pages occupied by compressor:")?; + Some((active + wired + compressed).saturating_mul(page_bytes)) +} + #[cfg(target_os = "linux")] fn cpu_brand() -> Option { let content = std::fs::read_to_string("/proc/cpuinfo").ok()?; @@ -450,9 +511,20 @@ fn sample_cpu() -> (Option, Option) { .and_then(|b| String::from_utf8(b).ok()) .and_then(|s| s.trim().parse::().ok()) .map(|v| (v / cpus).clamp(0.0, 100.0)); - // Total system CPU% on macOS would need host_statistics — skip and - // leave as None. The UI handles the missing value cleanly. - (app_pct, None) + // System CPU%: sum every process's ps %cpu (each a share of one + // core) and normalise by core count. ps reports a decaying average + // rather than a true instant, but it's a single fast call — no + // host_statistics FFI and no `top -l 2` second-sample stall that + // would block the poll — and tracks "is the machine busy" well + // enough for the load readout. + let total_pct = quiet_command("ps") + .args(["-A", "-o", "%cpu="]) + .output() + .ok() + .filter(|o| o.status.success()) + .and_then(|o| String::from_utf8(o.stdout).ok()) + .and_then(|s| parse_total_cpu_pct(&s, cpus)); + (app_pct, total_pct) } #[cfg(target_os = "windows")] @@ -541,7 +613,6 @@ fn win_process_cpu_seconds() -> Option { #[cfg(target_os = "windows")] fn win_total_cpu_times() -> Option<(u64, u64)> { // (idle ticks, total ticks). Returns 100ns ticks summed across cores. - use std::ffi::c_void; type Bool = i32; #[repr(C)] struct Filetime { @@ -635,9 +706,30 @@ fn sample_ram() -> (Option, Option, Option) { }) .and_then(|b| String::from_utf8(b).ok()) .and_then(|s| s.trim().parse::().ok()); - // System "used" via `vm_stat` page-counting is fiddly — leave as None; - // the UI handles missing values. - (app, total, None) + // System "used" ≈ (active + wired + compressed) pages × page size, + // the components Activity Monitor sums as "Memory Used". Page size + // differs by arch (16 KiB on Apple Silicon, 4 KiB on Intel), so + // read it rather than assume. + let page = quiet_command("sysctl") + .args(["-n", "hw.pagesize"]) + .output() + .ok() + .filter(|o| o.status.success()) + .and_then(|o| String::from_utf8(o.stdout).ok()) + .and_then(|s| s.trim().parse::().ok()) + .unwrap_or(4096); + let used = quiet_command("vm_stat") + .output() + .ok() + .filter(|o| o.status.success()) + .and_then(|o| String::from_utf8(o.stdout).ok()) + .and_then(|s| { + // Prefer vm_stat's own header page size; fall back to the + // sysctl value, then a 4 KiB default. + let page = parse_vm_stat_page_size(&s).unwrap_or(page); + parse_vm_stat_used_bytes(&s, page) + }); + (app, total, used) } #[cfg(target_os = "windows")] @@ -829,3 +921,56 @@ fn nvidia_app_vram_bytes() -> Option { } None } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn total_cpu_pct_sums_and_normalises() { + // Four processes at 50% of one core each, on a 4-core box → 50%. + let out = "50.0\n50.0\n50.0\n50.0\n"; + let pct = parse_total_cpu_pct(out, 4.0).unwrap(); + assert!((pct - 50.0).abs() < 1e-6, "got {pct}"); + } + + #[test] + fn total_cpu_pct_handles_blanks_and_clamps() { + assert_eq!(parse_total_cpu_pct("", 4.0), None); + assert_eq!(parse_total_cpu_pct(" \n \n", 4.0), None); + // Over-100% sum (transient ps quirk) clamps to 100. + assert_eq!(parse_total_cpu_pct("800.0\n", 4.0).unwrap(), 100.0); + // Zero/garbage cpu count is treated as 1, not a divide-by-zero. + assert_eq!(parse_total_cpu_pct("10.0\n", 0.0).unwrap(), 10.0); + } + + #[test] + fn vm_stat_used_sums_active_wired_compressed() { + let vm = "Mach Virtual Memory Statistics: (page size of 16384 bytes)\n\ +Pages free: 100000.\n\ +Pages active: 200000.\n\ +Pages inactive: 150000.\n\ +Pages speculative: 5000.\n\ +Pages wired down: 100000.\n\ +Pages occupied by compressor: 50000.\n"; + // (200000 + 100000 + 50000) pages × 16384 bytes. + let used = parse_vm_stat_used_bytes(vm, 16384).unwrap(); + assert_eq!(used, 350_000u64 * 16384); + } + + #[test] + fn vm_stat_page_size_parsed_from_header() { + let vm = "Mach Virtual Memory Statistics: (page size of 16384 bytes)\n\ +Pages active: 1.\n"; + assert_eq!(parse_vm_stat_page_size(vm), Some(16384)); + assert_eq!(parse_vm_stat_page_size("no header here"), None); + } + + #[test] + fn vm_stat_used_none_when_a_component_missing() { + // Has active + wired but no compressor line → None. + let vm = "Pages active: 10.\nPages wired down: 20.\n"; + assert_eq!(parse_vm_stat_used_bytes(vm, 4096), None); + assert_eq!(parse_vm_stat_used_bytes("garbage", 4096), None); + } +} diff --git a/src/config.ts b/src/config.ts index b7bda88..43235c6 100644 --- a/src/config.ts +++ b/src/config.ts @@ -111,6 +111,9 @@ const DEFAULT_CONFIG: Config = { // active_mode they persisted (mergeDefaults overlays raw on top). active_mode: "transcribe", model_cleanup_days: 1, + ollama_keep_alive: "30m", + ollama_throttle: "io", + warm_on_startup: true, cleanup_warning_suppressed_families: [], kept_models: [], mode_overrides: {}, diff --git a/src/types.ts b/src/types.ts index 77d2fe9..ed92052 100644 --- a/src/types.ts +++ b/src/types.ts @@ -11,6 +11,27 @@ export interface HardwareProfile { soc?: string | null; } +/** Live resource snapshot — mirrors the `LiveSnapshot` struct in + * src-tauri/src/usage.rs, returned by the `usage_live_snapshot` + * Tauri command. Every counter is optional on the Rust side so the + * UI renders "—" when a platform doesn't expose it. Shared between + * the Usage settings tab and the chat model-loading dialog so both + * read the same system lookups. */ +export interface LiveSnapshot { + cpu_app_pct: number | null; + cpu_total_pct: number | null; + ram_app_bytes: number | null; + ram_total_bytes: number | null; + ram_used_bytes: number | null; + gpu_pct: number | null; + vram_app_bytes: number | null; + vram_used_bytes: number | null; + vram_total_bytes: number | null; + process_uptime_seconds: number; + cpu_brand: string | null; + cpu_count: number | null; +} + export type Mode = "text" | "vision" | "code" | "transcribe" | "diarize"; /** Runtimes the resolver knows how to dispatch to. @@ -469,6 +490,25 @@ export interface Config { active_family: string; active_mode: Mode; model_cleanup_days: number; + /** Ollama `keep_alive` for chat requests — how long the model stays + * resident in RAM/VRAM after a turn before Ollama unloads it. + * Native Ollama duration format: "30m", "1h", "0" (unload right + * away, frees memory for transcription), "-1" (keep until evicted). + * Longer values avoid cold-start reloads between messages; shorter + * values suit memory-tight machines. Default "30m". */ + ollama_keep_alive: string; + /** How hard to throttle the Ollama server we spawn while it loads a + * model, so the disk thrash doesn't freeze the machine. "off" = no + * throttle; "io" = disk-IO priority only (keeps inference full speed, + * the default); "aggressive" = also demote CPU/QoS (most responsive + * desktop, slower inference). Only applies when MyOwnLLM spawns Ollama + * itself — not when it's a system/tray service. */ + ollama_throttle: "off" | "io" | "aggressive"; + /** Warm (preload) the active chat model in the background at startup so + * the first message doesn't pay the cold-load wait. On by default; the + * load runs under the configured throttle so it doesn't lock up the + * machine. Can be turned off in Settings → Performance. */ + warm_on_startup: boolean; /** Family names for which the user has dismissed the * "switching with auto-cleanup on" confirmation in the family * detail view's per-tier picker. Per-family rather than per-tier diff --git a/src/ui/App.svelte b/src/ui/App.svelte index acf7ef7..75f6a26 100644 --- a/src/ui/App.svelte +++ b/src/ui/App.svelte @@ -7,6 +7,7 @@ import Chat from "./Chat.svelte"; import TranscribeView from "./TranscribeView.svelte"; import Sidebar from "./Sidebar.svelte"; + import LoadingPulse from "./LoadingPulse.svelte"; import PermissionPromptModal from "./PermissionPromptModal.svelte"; import { loadConfig, updateConfig } from "../config"; import { getActiveManifest } from "../providers"; @@ -83,6 +84,12 @@ type View = "loading" | "chat"; let view = $state("loading"); + /** True while the startup warm is in flight: we keep a full-screen + * loading screen up (rather than dropping into a chat that feels + * sluggish because it's competing with the cold load) until the model + * is resident. The chat still mounts behind it, so it's fully ready + * when the screen lifts. A "Continue" button is the escape hatch. */ + let warming = $state(false); let appVersion = $state(""); let hardware = $state(null); let activeModel = $state(""); @@ -276,6 +283,31 @@ // run the install lazily). view = "chat"; invoke("ollama_ensure_running").catch(() => {}); + + // Warm the chat model so the first message doesn't pay the cold-load + // wait. On by default; the load runs under the configured throttle + // (Settings → Performance). Skipped when the user turned it off, when + // the model isn't on disk yet (the download overlay owns that), or + // when keep_alive is "0" (warming would just load-then-unload). + // + // We hold a full-screen loading screen (`warming`) over the chat — + // which keeps mounting/initializing behind it — until the warm + // settles, so the user lands on a chat that's actually ready instead + // of one that feels sluggish while it competes with the cold load. + if ( + config.warm_on_startup !== false && + pendingTextModel && + !textModelMissing && + config.ollama_keep_alive !== "0" + ) { + warming = true; + invoke("ollama_warm", { model: pendingTextModel }) + .catch(() => {}) + .finally(() => { + warming = false; + }); + } + kickUpdateCheck(); // Seed the sidebar early so it's ready when the chat view paints. @@ -1143,6 +1175,23 @@ the same modal; the modal self-hides when the prompt queue drains. --> + + {#if warming} + +
+
+ + + {#if appVersion} +

v{appVersion}

+ {/if} +
+ {/if} diff --git a/src/ui/SettingsPanel.svelte b/src/ui/SettingsPanel.svelte index 05436c3..4748e49 100644 --- a/src/ui/SettingsPanel.svelte +++ b/src/ui/SettingsPanel.svelte @@ -3,6 +3,7 @@ import ModelsSection from "./settings/ModelsSection.svelte"; import StorageSection from "./settings/StorageSection.svelte"; import HardwareSection from "./settings/HardwareSection.svelte"; + import PerformanceSection from "./settings/PerformanceSection.svelte"; import UsageSection from "./settings/UsageSection.svelte"; import UpdatesSection from "./settings/UpdatesSection.svelte"; import CloudMeshSection from "./settings/CloudMeshSection.svelte"; @@ -18,6 +19,7 @@ | "prompts" | "permissions" | "hardware" + | "performance" | "storage" | "usage" | "cloud-mesh" @@ -81,6 +83,7 @@ { id: "prompts", label: "Prompts" }, { id: "permissions", label: "Permissions" }, { id: "hardware", label: "Hardware" }, + { id: "performance", label: "Performance" }, { id: "storage", label: "Storage" }, { id: "usage", label: "Usage" }, { id: "updates", label: "Updates" }, @@ -151,6 +154,8 @@ (active = t)} /> {:else if active === "hardware"} (active = t)} /> + {:else if active === "performance"} + {:else if active === "usage"} {:else if active === "cloud-mesh"} diff --git a/src/ui/settings/PerformanceSection.svelte b/src/ui/settings/PerformanceSection.svelte new file mode 100644 index 0000000..54d939c --- /dev/null +++ b/src/ui/settings/PerformanceSection.svelte @@ -0,0 +1,244 @@ + + +
+
+

+ Tune how MyOwnLLM trades responsiveness against + load and inference speed when running models locally. + These apply to the Ollama server MyOwnLLM launches itself. +

+
+ + {#if loading} +

Loading…

+ {:else if error} +

{error}

+ {:else} +
+
+
Model memory
+ +
+
Keep model loaded
+

+ How long the chat model stays in memory after a reply. Longer + keeps later messages instant; shorter frees RAM/VRAM sooner — + handy when transcription needs to run alongside on a + memory-tight machine. +

+
+
+
Keep model loaded for
+
+ +
+
+
+
+ +
Loading
+ +
+
Load throttle
+

+ Loading a model reads gigabytes from disk, which can freeze a + laptop. This throttles those reads so the machine stays usable. + Loading is disk-bound and inference is compute-bound, so the + balanced default eases disk only and leaves token generation at + full speed. +

+
+
+
While a model loads
+
+ +
+
+
+ {#if throttleHint} +

{throttleHint}

+ {/if} +
+ +
+
Warm at startup
+

+ Preload the chat model in the background when the app starts, so + your first message doesn't wait for it to load. The load runs + under the throttle above, so it won't lock up the machine. +

+ +
+ +

+ Throttling only applies when MyOwnLLM starts the Ollama server + itself. If Ollama is already running as a system or tray service, + these settings don't affect it. +

+
+ +
+ {/if} +
+ + diff --git a/src/ui/settings/UsageSection.svelte b/src/ui/settings/UsageSection.svelte index 4912282..150fd21 100644 --- a/src/ui/settings/UsageSection.svelte +++ b/src/ui/settings/UsageSection.svelte @@ -2,24 +2,7 @@ import { onMount, onDestroy } from "svelte"; import { invoke } from "@tauri-apps/api/core"; import { scrollAffordance } from "../scroll-affordance"; - - // Mirrors the LiveSnapshot struct in src-tauri/src/usage.rs. Every - // field is optional on the Rust side so we can render "—" when a - // platform doesn't expose the underlying counter. - interface LiveSnapshot { - cpu_app_pct: number | null; - cpu_total_pct: number | null; - ram_app_bytes: number | null; - ram_total_bytes: number | null; - ram_used_bytes: number | null; - gpu_pct: number | null; - vram_app_bytes: number | null; - vram_used_bytes: number | null; - vram_total_bytes: number | null; - process_uptime_seconds: number; - cpu_brand: string | null; - cpu_count: number | null; - } + import type { LiveSnapshot } from "../../types"; // Mirrors UsageStats in src-tauri/src/usage.rs. interface UsageStats {