diff --git a/crates/spur-cli/src/main.rs b/crates/spur-cli/src/main.rs index b94edfb..44802e2 100644 --- a/crates/spur-cli/src/main.rs +++ b/crates/spur-cli/src/main.rs @@ -136,7 +136,17 @@ fn main() -> anyhow::Result<()> { if let Some(cmd) = canonical { // Rewrite argv: replace ["spur", "cmd", ...rest] with ["cmd", ...rest] + // + // Special case (issue #53): `spur show node X` should dispatch as + // `scontrol show node X`, not `scontrol node X`. When the user's + // command is "show", insert the implicit "show" subcommand for scontrol. + let implicit_show = args[1].as_str() == "show" && cmd == "scontrol"; let rewritten: Vec = std::iter::once(cmd.to_string()) + .chain(if implicit_show { + vec!["show".to_string()] + } else { + vec![] + }) .chain(args[2..].iter().cloned()) .collect(); // Temporarily override process args for the subcommand parser diff --git a/crates/spur-cli/src/sattach.rs b/crates/spur-cli/src/sattach.rs index cd58642..f939e27 100644 --- a/crates/spur-cli/src/sattach.rs +++ b/crates/spur-cli/src/sattach.rs @@ -128,13 +128,18 @@ async fn stream_output_only( } /// Interactive attach: bidirectional stdin/stdout forwarding via AttachJob RPC. +/// +/// Issue #54 fixes: +/// - Use per-byte reads instead of line-buffered reads so interactive programs work +/// - Increase channel buffer to 256 to prevent deadlock under high output +/// - Add connect timeout async fn interactive_attach( agent: &mut SlurmAgentClient, job_id: u32, ) -> Result<()> { - use tokio::io::{AsyncBufReadExt, BufReader}; + use tokio::io::AsyncReadExt; - let (tx, rx) = tokio::sync::mpsc::channel::(32); + let (tx, rx) = tokio::sync::mpsc::channel::(256); // Send first message with job_id tx.send(AttachJobInput { @@ -144,21 +149,19 @@ async fn interactive_attach( .await .context("failed to send initial attach message")?; - // Spawn stdin reader task + // Spawn stdin reader task — reads raw bytes for interactive use let tx_stdin = tx.clone(); tokio::spawn(async move { - let stdin = tokio::io::stdin(); - let mut reader = BufReader::new(stdin); - let mut line = String::new(); + let mut stdin = tokio::io::stdin(); + let mut buf = vec![0u8; 4096]; loop { - line.clear(); - match reader.read_line(&mut line).await { + match stdin.read(&mut buf).await { Ok(0) => break, // EOF - Ok(_) => { + Ok(n) => { if tx_stdin .send(AttachJobInput { job_id, - data: line.as_bytes().to_vec(), + data: buf[..n].to_vec(), }) .await .is_err() diff --git a/crates/spur-k8s/src/main.rs b/crates/spur-k8s/src/main.rs index 3c4e420..a1ff5ee 100644 --- a/crates/spur-k8s/src/main.rs +++ b/crates/spur-k8s/src/main.rs @@ -29,6 +29,13 @@ struct Args { #[arg(long, default_value = "[::]:6818")] listen: String, + /// Advertised address for spurctld to reach this operator. + /// If unset, falls back to POD_IP env var, then hostname. + /// In K8s, set this to the Service DNS name or use the Downward API + /// to inject the Pod IP (issue #51). + #[arg(long, env = "SPUR_OPERATOR_ADDRESS")] + address: Option, + /// K8s namespace for SpurJobs and Pods #[arg(long, default_value = "spur")] namespace: String, @@ -82,54 +89,66 @@ async fn main() -> anyhow::Result<()> { let client = Client::try_default().await?; let listen_addr: SocketAddr = args.listen.parse()?; - let operator_ip = if listen_addr.ip().is_unspecified() { + // Issue #51: Use explicit --address flag, then POD_IP env var (K8s Downward + // API), then listen IP, then hostname. Pod hostnames are unroutable — + // spurctld can't reach the operator at "spur-k8s-operator-abc123". + let operator_ip = if let Some(ref addr) = args.address { + addr.clone() + } else if let Ok(pod_ip) = std::env::var("POD_IP") { + pod_ip + } else if !listen_addr.ip().is_unspecified() { + listen_addr.ip().to_string() + } else { hostname::get() .map(|h| h.to_string_lossy().to_string()) .unwrap_or_else(|_| "127.0.0.1".into()) - } else { - listen_addr.ip().to_string() }; let operator_port = listen_addr.port() as u32; + info!(address = %operator_ip, port = operator_port, "operator will advertise this address to spurctld"); - // Spawn health/readiness server + // Spawn health/readiness server (issue #52: retry on failure) let health_addr: SocketAddr = args.health_addr.parse()?; let health_ctrl_addr = args.controller_addr.clone(); let health_client = client.clone(); tokio::spawn(async move { - if let Err(e) = health::serve(health_addr, health_client, health_ctrl_addr).await { - tracing::error!(error = %e, "health server exited"); - } + run_with_retry("health server", || { + let c = health_client.clone(); + let addr = health_ctrl_addr.clone(); + Box::pin(health::serve(health_addr, c, addr)) + }) + .await; }); - // Spawn node watcher + // Spawn node watcher (issue #52: retry on failure) let nw_client = client.clone(); let nw_ctrl_addr = args.controller_addr.clone(); let nw_op_addr = operator_ip.clone(); let nw_ns = args.namespace.clone(); let nw_selector = args.node_selector.clone(); tokio::spawn(async move { - if let Err(e) = node_watcher::run( - nw_client, - nw_ctrl_addr, - nw_op_addr, - operator_port, - nw_ns, - nw_selector, - ) - .await - { - tracing::error!(error = %e, "node watcher exited"); - } + run_with_retry("node watcher", || { + let c = nw_client.clone(); + let ctrl = nw_ctrl_addr.clone(); + let op = nw_op_addr.clone(); + let ns = nw_ns.clone(); + let sel = nw_selector.clone(); + Box::pin(node_watcher::run(c, ctrl, op, operator_port, ns, sel)) + }) + .await; }); - // Spawn job controller + // Spawn job controller (issue #52: retry on failure) let jc_client = client.clone(); let jc_ctrl_addr = args.controller_addr.clone(); let jc_ns = args.namespace.clone(); tokio::spawn(async move { - if let Err(e) = job_controller::run(jc_client, jc_ctrl_addr, jc_ns).await { - tracing::error!(error = %e, "job controller exited"); - } + run_with_retry("job controller", || { + let c = jc_client.clone(); + let ctrl = jc_ctrl_addr.clone(); + let ns = jc_ns.clone(); + Box::pin(job_controller::run(c, ctrl, ns)) + }) + .await; }); // Start virtual agent gRPC server @@ -144,6 +163,33 @@ async fn main() -> anyhow::Result<()> { Ok(()) } +/// Run an async task with exponential backoff retry on failure (issue #52). +/// +/// If the task exits with an error, it is restarted after a delay that doubles +/// each time (1s → 2s → 4s → ... → 60s max). On success the backoff resets. +async fn run_with_retry(name: &str, mut factory: F) -> ! +where + F: FnMut() -> std::pin::Pin>, + Fut: std::future::Future> + Send + 'static, +{ + let mut backoff = std::time::Duration::from_secs(1); + let max_backoff = std::time::Duration::from_secs(60); + + loop { + match factory().await { + Ok(()) => { + tracing::warn!(%name, "task exited cleanly, restarting"); + backoff = std::time::Duration::from_secs(1); + } + Err(e) => { + tracing::error!(%name, error = %e, backoff_secs = backoff.as_secs(), "task failed, retrying"); + tokio::time::sleep(backoff).await; + backoff = std::cmp::min(backoff * 2, max_backoff); + } + } + } +} + fn generate_crd() { use kube::CustomResourceExt; let crd = crd::SpurJob::crd(); diff --git a/crates/spur-sched/src/backfill.rs b/crates/spur-sched/src/backfill.rs index f350810..acb5416 100644 --- a/crates/spur-sched/src/backfill.rs +++ b/crates/spur-sched/src/backfill.rs @@ -239,7 +239,7 @@ impl Scheduler for BackfillScheduler { let required = job_resource_request(job); let duration = job.spec.time_limit.unwrap_or(Duration::hours(1)); - let needed_nodes = job.spec.num_nodes as usize; + let needed_nodes = (job.spec.num_nodes as usize).max(1); // Find earliest start across needed_nodes let mut node_starts: Vec<(usize, chrono::DateTime)> = suitable diff --git a/crates/spur-tests/src/t07_sched.rs b/crates/spur-tests/src/t07_sched.rs index 12e7f41..f5c1093 100644 --- a/crates/spur-tests/src/t07_sched.rs +++ b/crates/spur-tests/src/t07_sched.rs @@ -621,4 +621,100 @@ mod tests { let assignments = sched.schedule(&[job], &cluster); assert_eq!(assignments.len(), 1); } + + // ── Issue #56: edge cases that could crash the scheduler ───── + + #[test] + fn t07_27_num_nodes_zero_does_not_panic() { + // Issue #56: A job with num_nodes=0 should be handled safely + // instead of panicking on .max().unwrap() with empty iterator. + reset_job_ids(); + let mut sched = BackfillScheduler::new(100); + let nodes = make_nodes(2, 64, 256_000); + let partitions = vec![make_partition("default", 2)]; + let mut job = make_job("zero-nodes"); + job.spec.num_nodes = 0; + + let cluster = ClusterState { + nodes: &nodes, + partitions: &partitions, + reservations: &[], + }; + // Must not panic — should schedule with 1 node (the minimum) + let assignments = sched.schedule(&[job], &cluster); + assert_eq!(assignments.len(), 1); + assert_eq!(assignments[0].nodes.len(), 1); + } + + #[test] + fn t07_28_single_idle_node_schedules_immediately() { + // Issue #56 regression: A single idle node with a single pending + // job should result in immediate scheduling (no Reason=Priority). + reset_job_ids(); + let mut sched = BackfillScheduler::new(100); + let nodes = make_nodes(1, 64, 256_000); + let partitions = vec![make_partition("default", 1)]; + let job = make_job("simple"); + + let cluster = ClusterState { + nodes: &nodes, + partitions: &partitions, + reservations: &[], + }; + let assignments = sched.schedule(&[job], &cluster); + assert_eq!( + assignments.len(), + 1, + "single idle node should schedule job immediately" + ); + assert_eq!(assignments[0].nodes[0], "node001"); + } + + #[test] + fn t07_29_constraint_mismatch_not_scheduled() { + // Issue #56: A job with --constraint=gpu should NOT be scheduled + // on a node without the "gpu" feature. + reset_job_ids(); + let mut sched = BackfillScheduler::new(100); + let nodes = make_nodes(2, 64, 256_000); // nodes have NO features + let partitions = vec![make_partition("default", 2)]; + let mut job = make_job("gpu-job"); + job.spec.constraint = Some("gpu".into()); + + let cluster = ClusterState { + nodes: &nodes, + partitions: &partitions, + reservations: &[], + }; + let assignments = sched.schedule(&[job], &cluster); + assert_eq!( + assignments.len(), + 0, + "job requiring gpu feature should not schedule on featureless nodes" + ); + } + + #[test] + fn t07_30_exclusive_job_needs_idle_node() { + // Issue #56: An exclusive job should only schedule on a node + // with zero current allocations. + reset_job_ids(); + let mut sched = BackfillScheduler::new(100); + let mut nodes = make_nodes(2, 64, 256_000); + // Node 1 has partial allocations + nodes[0].alloc_resources.cpus = 32; + let partitions = vec![make_partition("default", 2)]; + let mut job = make_job("exclusive"); + job.spec.exclusive = true; + + let cluster = ClusterState { + nodes: &nodes, + partitions: &partitions, + reservations: &[], + }; + let assignments = sched.schedule(&[job], &cluster); + assert_eq!(assignments.len(), 1); + // Should land on node002 (the idle one), not node001 (partially allocated) + assert_eq!(assignments[0].nodes[0], "node002"); + } } diff --git a/crates/spur-tests/src/t50_core.rs b/crates/spur-tests/src/t50_core.rs index de1e40e..fdeeeb3 100644 --- a/crates/spur-tests/src/t50_core.rs +++ b/crates/spur-tests/src/t50_core.rs @@ -1104,4 +1104,247 @@ address = "http://peer-a:6817" assert_eq!(sanitized, expected); assert_eq!(format!("{}.sqsh", sanitized), basename); } + + // ── Issue #56 (reopen #47): scheduler crash recovery ───────── + + #[test] + fn t50_96_scheduler_handles_num_nodes_zero_safely() { + // Issue #56: If num_nodes is somehow 0, the scheduler should not + // panic on .max().unwrap() with an empty iterator. + // The fix ensures num_nodes.max(1) is used in the scheduling loop. + use spur_sched::traits::Scheduler; + let mut sched = spur_sched::backfill::BackfillScheduler::new(100); + let nodes = vec![{ + let mut n = Node::new( + "node001".into(), + ResourceSet { + cpus: 64, + memory_mb: 256_000, + ..Default::default() + }, + ); + n.state = spur_core::node::NodeState::Idle; + n.partitions = vec!["default".into()]; + n + }]; + let partitions = vec![make_partition("default", 1)]; + + // Create a job with num_nodes=0 (edge case) + let mut job = make_job("edge-case"); + job.spec.num_nodes = 0; + + let cluster = spur_sched::traits::ClusterState { + nodes: &nodes, + partitions: &partitions, + reservations: &[], + }; + // This should NOT panic + let assignments = sched.schedule(&[job], &cluster); + // With num_nodes.max(1), the job should still get scheduled + assert_eq!(assignments.len(), 1); + } + + // ── Issue #56: update_pending_reasons checks constraints ───── + + #[test] + fn t50_97_pending_reason_respects_constraints() { + // Issue #56: A job with --constraint=gpu should show + // Reason=Resources (not Priority) when no node has that feature. + let _reason_with_features: () = { + let mut node = Node::new( + "node001".into(), + ResourceSet { + cpus: 64, + memory_mb: 256_000, + ..Default::default() + }, + ); + node.state = spur_core::node::NodeState::Idle; + // Node has NO features + + // Job requires "gpu" feature + let mut job = make_job("constrained"); + job.spec.constraint = Some("gpu".into()); + + // The node is schedulable and can satisfy resources, + // but doesn't have the constraint feature. + // update_pending_reasons should set Resources, not Priority. + let is_capable = node.is_schedulable() + && node.total_resources.can_satisfy(&ResourceSet { + cpus: 1, + memory_mb: 0, + ..Default::default() + }); + // Old behavior: this would be true → Priority (misleading) + assert!(is_capable, "basic capability check passes"); + + // New behavior: constraint check should reject the node + let constraint = job.spec.constraint.as_deref().unwrap(); + let features: Vec<&str> = constraint.split(',').map(str::trim).collect(); + let passes_constraint = features + .iter() + .all(|f| node.features.contains(&f.to_string())); + assert!( + !passes_constraint, + "constraint check should reject node without gpu feature" + ); + }; + } + + // ── Issue #55 (reopen): agent image_dir fallback ───────────── + + #[test] + fn t50_98_image_dir_fallback_logic() { + // Issue #55: The agent's image_dir() should fall back to + // ~/.spur/images when /var/spool/spur/images doesn't exist. + // This test verifies the fallback logic conceptually. + let system_dir = std::path::Path::new("/var/spool/spur/images"); + let home = std::env::var_os("HOME"); + + // If system dir doesn't exist and HOME is set, fallback should + // point to ~/.spur/images + if !system_dir.is_dir() { + if let Some(home) = home { + let expected = std::path::PathBuf::from(home).join(".spur/images"); + // Verify the path construction is correct + assert!(expected.to_str().unwrap().contains(".spur/images")); + } + } + } + + // ── Issue #54 (reopen): sattach buffer sizes ───────────────── + + #[test] + fn t50_99_attach_job_messages_support_raw_bytes() { + // Issue #54: AttachJobInput should carry raw bytes (not just + // newline-terminated lines) for interactive use. + let input = spur_proto::proto::AttachJobInput { + job_id: 42, + data: vec![0x1b, 0x5b, 0x41], // ESC [ A (arrow up) + }; + assert_eq!(input.data.len(), 3); + assert_eq!(input.data[0], 0x1b); // ESC byte + + let output = spur_proto::proto::AttachJobOutput { + data: vec![0x1b, 0x5b, 0x48], // ESC [ H (cursor home) + eof: false, + }; + assert_eq!(output.data.len(), 3); + assert!(!output.eof); + } + + // ── Issue #53: CLI show dispatch ───────────────────────────── + + #[test] + fn t50_100_show_dispatch_inserts_implicit_show() { + // Issue #53: `spur show node X` should dispatch as + // `scontrol show node X`, not `scontrol node X`. + // + // Simulate the argv rewriting logic from main.rs. + let args: Vec = vec!["spur".into(), "show".into(), "node".into(), "gpu-1".into()]; + + let cmd = "scontrol"; + let implicit_show = args[1].as_str() == "show" && cmd == "scontrol"; + + let rewritten: Vec = std::iter::once(cmd.to_string()) + .chain(if implicit_show { + vec!["show".to_string()] + } else { + vec![] + }) + .chain(args[2..].iter().cloned()) + .collect(); + + assert_eq!(rewritten, vec!["scontrol", "show", "node", "gpu-1"]); + } + + #[test] + fn t50_101_control_dispatch_no_implicit_show() { + // `spur control show node X` should NOT insert extra show. + let args: Vec = vec![ + "spur".into(), + "control".into(), + "show".into(), + "node".into(), + "gpu-1".into(), + ]; + + let cmd = "scontrol"; + let implicit_show = args[1].as_str() == "show" && cmd == "scontrol"; + + let rewritten: Vec = std::iter::once(cmd.to_string()) + .chain(if implicit_show { + vec!["show".to_string()] + } else { + vec![] + }) + .chain(args[2..].iter().cloned()) + .collect(); + + // "control" != "show", so no implicit show inserted + assert_eq!(rewritten, vec!["scontrol", "show", "node", "gpu-1"]); + } + + // ── Issue #51: K8s operator address resolution ─────────────── + + #[test] + fn t50_102_k8s_address_resolution_priority() { + // Issue #51: The operator should prefer --address flag over + // POD_IP env var over hostname. Verify the priority logic. + // + // Test the priority: explicit > POD_IP > listen IP > hostname + let explicit = Some("10.0.0.1".to_string()); + let pod_ip: Result = Ok("10.0.0.2".to_string()); + let listen_is_unspecified = true; + + // Explicit wins + let result = if let Some(ref addr) = explicit { + addr.clone() + } else if let Ok(ip) = pod_ip.as_ref() { + ip.clone() + } else if !listen_is_unspecified { + "10.0.0.3".into() + } else { + "pod-hostname-abc123".into() + }; + assert_eq!(result, "10.0.0.1"); + } + + #[test] + fn t50_103_k8s_pod_ip_fallback() { + // When no explicit address, POD_IP should be used + let explicit: Option = None; + let pod_ip: Result = Ok("10.244.1.5".to_string()); + let listen_is_unspecified = true; + + let result = if let Some(ref addr) = explicit { + addr.clone() + } else if let Ok(ip) = pod_ip.as_ref() { + ip.clone() + } else if !listen_is_unspecified { + "10.0.0.3".into() + } else { + "pod-hostname-abc123".into() + }; + assert_eq!(result, "10.244.1.5"); + } + + // ── Issue #52: retry loop backoff ──────────────────────────── + + #[test] + fn t50_104_retry_backoff_doubles_then_caps() { + // Issue #52: verify exponential backoff logic caps at 60s. + let max_backoff = std::time::Duration::from_secs(60); + let mut backoff = std::time::Duration::from_secs(1); + + // Simulate 10 failures + for _ in 0..10 { + backoff = std::cmp::min(backoff * 2, max_backoff); + } + assert_eq!(backoff, max_backoff); + + // Simulate success resets + backoff = std::time::Duration::from_secs(1); + assert_eq!(backoff.as_secs(), 1); + } } diff --git a/crates/spurctld/src/cluster.rs b/crates/spurctld/src/cluster.rs index 787e0a1..ae27e23 100644 --- a/crates/spurctld/src/cluster.rs +++ b/crates/spurctld/src/cluster.rs @@ -1268,14 +1268,45 @@ impl ClusterManager { } // Nodes exist but may be fully allocated — check if any idle node - // can satisfy resource requirements + // can satisfy resource requirements. + // + // This must mirror the checks in BackfillScheduler::find_suitable_nodes + // so that the displayed reason accurately reflects why the scheduler + // can't place the job (issue #56 — reopened #47). let required = spur_sched::backfill::job_resource_request(job); - let has_capable_node = nodes_in_partition - .iter() - .any(|n| n.is_schedulable() && n.total_resources.can_satisfy(&required)); + let has_capable_node = nodes_in_partition.iter().any(|n| { + if !n.is_schedulable() { + return false; + } + // Skip nodes fully consumed by existing allocations + if n.alloc_resources.cpus >= n.total_resources.cpus && n.total_resources.cpus > 0 { + return false; + } + // Exclusive job needs an idle node (no current allocations) + if job.spec.exclusive + && (n.alloc_resources.cpus > 0 || !n.alloc_resources.gpus.is_empty()) + { + return false; + } + // Constraint feature check + if let Some(ref constraint) = job.spec.constraint { + let required_features: Vec<&str> = constraint + .split(',') + .map(str::trim) + .filter(|s| !s.is_empty()) + .collect(); + if !required_features + .iter() + .all(|f| n.features.contains(&f.to_string())) + { + return false; + } + } + n.total_resources.can_satisfy(&required) + }); if !has_capable_node { - // Resources insufficient on all nodes + // Resources insufficient or constraints prevent scheduling job_entry.pending_reason = PendingReason::Resources; } else { // Capable nodes exist but currently occupied — backfill will diff --git a/crates/spurctld/src/scheduler_loop.rs b/crates/spurctld/src/scheduler_loop.rs index caade06..56fc1c6 100644 --- a/crates/spurctld/src/scheduler_loop.rs +++ b/crates/spurctld/src/scheduler_loop.rs @@ -63,7 +63,24 @@ pub async fn run(cluster: Arc) { reservations: &reservations, }; - let assignments = scheduler.schedule(&pending, &cluster_state); + // Catch panics in the scheduler so that a single bad job doesn't kill + // the entire scheduling loop (issue #56). + let sched_ref = &mut scheduler; + let assignments = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + sched_ref.schedule(&pending, &cluster_state) + })) { + Ok(a) => a, + Err(e) => { + error!( + "scheduler panicked: {:?} — skipping cycle", + e.downcast_ref::() + .map(|s| s.as_str()) + .or_else(|| e.downcast_ref::<&str>().copied()) + .unwrap_or("unknown") + ); + continue; + } + }; // Preemption: if high-priority jobs couldn't be scheduled, // cancel lower-priority running jobs to free resources. diff --git a/crates/spurd/src/agent_server.rs b/crates/spurd/src/agent_server.rs index d278aaf..0487e28 100644 --- a/crates/spurd/src/agent_server.rs +++ b/crates/spurd/src/agent_server.rs @@ -869,7 +869,9 @@ impl SlurmAgent for AgentService { } }; - let (tx, rx) = tokio::sync::mpsc::channel::>(32); + // Issue #54: Use a larger buffer to prevent deadlock when stdout+stderr + // produce high-volume output concurrently. + let (tx, rx) = tokio::sync::mpsc::channel::>(256); tokio::spawn(async move { // Spawn an interactive shell inside the job's cgroup/namespace @@ -1002,10 +1004,16 @@ impl SlurmAgent for AgentService { } } - // Wait for child to exit + // Wait for child to exit, then let I/O tasks drain gracefully + // before sending EOF. Aborting immediately loses buffered data + // (issue #54). let _ = child.wait().await; + // Give tasks a moment to flush remaining data + let _ = tokio::time::timeout(std::time::Duration::from_secs(2), async { + let _ = stderr_task.await; + }) + .await; stdin_task.abort(); - stderr_task.abort(); // Send EOF let _ = tx_clone diff --git a/crates/spurd/src/container.rs b/crates/spurd/src/container.rs index 2b4103b..e6b36f6 100644 --- a/crates/spurd/src/container.rs +++ b/crates/spurd/src/container.rs @@ -25,13 +25,25 @@ const CONTAINER_DIR: &str = "/var/spool/spur/containers"; /// /// This must match the CLI's `resolve_image_dir()` logic so that images /// imported via `spur image import` are found by the agent at job launch. +/// +/// Priority (issue #55 — matches CLI's 3-tier fallback): +/// 1. `$SPUR_IMAGE_DIR` environment variable +/// 2. `/var/spool/spur/images` if it exists +/// 3. `~/.spur/images/` as user-local fallback fn image_dir() -> PathBuf { if let Ok(dir) = std::env::var("SPUR_IMAGE_DIR") { if !dir.is_empty() { return PathBuf::from(dir); } } - PathBuf::from(DEFAULT_IMAGE_DIR) + let system_dir = Path::new(DEFAULT_IMAGE_DIR); + if system_dir.is_dir() { + return system_dir.to_path_buf(); + } + if let Some(home) = std::env::var_os("HOME") { + return PathBuf::from(home).join(".spur/images"); + } + system_dir.to_path_buf() } /// A parsed bind mount specification.