diff --git a/crates/spur-cli/src/main.rs b/crates/spur-cli/src/main.rs
index b94edfb..44802e2 100644
--- a/crates/spur-cli/src/main.rs
+++ b/crates/spur-cli/src/main.rs
@@ -136,7 +136,17 @@ fn main() -> anyhow::Result<()> {
 
     if let Some(cmd) = canonical {
         // Rewrite argv: replace ["spur", "cmd", ...rest] with ["cmd", ...rest]
+        //
+        // Special case (issue #53): `spur show node X` should dispatch as
+        // `scontrol show node X`, not `scontrol node X`. When the user's
+        // command is "show", insert the implicit "show" subcommand for scontrol.
+        let implicit_show = args[1].as_str() == "show" && cmd == "scontrol";
         let rewritten: Vec<String> = std::iter::once(cmd.to_string())
+            .chain(if implicit_show {
+                vec!["show".to_string()]
+            } else {
+                vec![]
+            })
             .chain(args[2..].iter().cloned())
             .collect();
         // Temporarily override process args for the subcommand parser
diff --git a/crates/spur-cli/src/sattach.rs b/crates/spur-cli/src/sattach.rs
index cd58642..f939e27 100644
--- a/crates/spur-cli/src/sattach.rs
+++ b/crates/spur-cli/src/sattach.rs
@@ -128,13 +128,18 @@ async fn stream_output_only(
 }
 
 /// Interactive attach: bidirectional stdin/stdout forwarding via AttachJob RPC.
+///
+/// Issue #54 fixes:
+/// - Use per-byte reads instead of line-buffered reads so interactive programs work
+/// - Increase channel buffer to 256 to prevent deadlock under high output
+/// - Add connect timeout
 async fn interactive_attach(
     agent: &mut SlurmAgentClient<tonic::transport::Channel>,
     job_id: u32,
 ) -> Result<()> {
-    use tokio::io::{AsyncBufReadExt, BufReader};
+    use tokio::io::AsyncReadExt;
 
-    let (tx, rx) = tokio::sync::mpsc::channel::<AttachJobInput>(32);
+    let (tx, rx) = tokio::sync::mpsc::channel::<AttachJobInput>(256);
 
     // Send first message with job_id
     tx.send(AttachJobInput {
@@ -144,21 +149,19 @@ async fn interactive_attach(
     .await
     .context("failed to send initial attach message")?;
 
-    // Spawn stdin reader task
+    // Spawn stdin reader task — reads raw bytes for interactive use
     let tx_stdin = tx.clone();
     tokio::spawn(async move {
-        let stdin = tokio::io::stdin();
-        let mut reader = BufReader::new(stdin);
-        let mut line = String::new();
+        let mut stdin = tokio::io::stdin();
+        let mut buf = vec![0u8; 4096];
         loop {
-            line.clear();
-            match reader.read_line(&mut line).await {
+            match stdin.read(&mut buf).await {
                 Ok(0) => break, // EOF
-                Ok(_) => {
+                Ok(n) => {
                     if tx_stdin
                         .send(AttachJobInput {
                             job_id,
-                            data: line.as_bytes().to_vec(),
+                            data: buf[..n].to_vec(),
                         })
                         .await
                         .is_err()
diff --git a/crates/spur-k8s/src/main.rs b/crates/spur-k8s/src/main.rs
index 3c4e420..a1ff5ee 100644
--- a/crates/spur-k8s/src/main.rs
+++ b/crates/spur-k8s/src/main.rs
@@ -29,6 +29,13 @@ struct Args {
     #[arg(long, default_value = "[::]:6818")]
     listen: String,
 
+    /// Advertised address for spurctld to reach this operator.
+    /// If unset, falls back to POD_IP env var, then hostname.
+    /// In K8s, set this to the Service DNS name or use the Downward API
+    /// to inject the Pod IP (issue #51).
+    #[arg(long, env = "SPUR_OPERATOR_ADDRESS")]
+    address: Option<String>,
+
     /// K8s namespace for SpurJobs and Pods
     #[arg(long, default_value = "spur")]
     namespace: String,
@@ -82,54 +89,66 @@ async fn main() -> anyhow::Result<()> {
     let client = Client::try_default().await?;
 
     let listen_addr: SocketAddr = args.listen.parse()?;
-    let operator_ip = if listen_addr.ip().is_unspecified() {
+    // Issue #51: Use explicit --address flag, then POD_IP env var (K8s Downward
+    // API), then listen IP, then hostname. Pod hostnames are unroutable —
+    // spurctld can't reach the operator at "spur-k8s-operator-abc123".
+    let operator_ip = if let Some(ref addr) = args.address {
+        addr.clone()
+    } else if let Ok(pod_ip) = std::env::var("POD_IP") {
+        pod_ip
+    } else if !listen_addr.ip().is_unspecified() {
+        listen_addr.ip().to_string()
+    } else {
         hostname::get()
             .map(|h| h.to_string_lossy().to_string())
             .unwrap_or_else(|_| "127.0.0.1".into())
-    } else {
-        listen_addr.ip().to_string()
     };
     let operator_port = listen_addr.port() as u32;
+    info!(address = %operator_ip, port = operator_port, "operator will advertise this address to spurctld");
 
-    // Spawn health/readiness server
+    // Spawn health/readiness server (issue #52: retry on failure)
     let health_addr: SocketAddr = args.health_addr.parse()?;
     let health_ctrl_addr = args.controller_addr.clone();
     let health_client = client.clone();
     tokio::spawn(async move {
-        if let Err(e) = health::serve(health_addr, health_client, health_ctrl_addr).await {
-            tracing::error!(error = %e, "health server exited");
-        }
+        run_with_retry("health server", || {
+            let c = health_client.clone();
+            let addr = health_ctrl_addr.clone();
+            Box::pin(health::serve(health_addr, c, addr))
+        })
+        .await;
     });
 
-    // Spawn node watcher
+    // Spawn node watcher (issue #52: retry on failure)
     let nw_client = client.clone();
     let nw_ctrl_addr = args.controller_addr.clone();
     let nw_op_addr = operator_ip.clone();
     let nw_ns = args.namespace.clone();
     let nw_selector = args.node_selector.clone();
     tokio::spawn(async move {
-        if let Err(e) = node_watcher::run(
-            nw_client,
-            nw_ctrl_addr,
-            nw_op_addr,
-            operator_port,
-            nw_ns,
-            nw_selector,
-        )
-        .await
-        {
-            tracing::error!(error = %e, "node watcher exited");
-        }
+        run_with_retry("node watcher", || {
+            let c = nw_client.clone();
+            let ctrl = nw_ctrl_addr.clone();
+            let op = nw_op_addr.clone();
+            let ns = nw_ns.clone();
+            let sel = nw_selector.clone();
+            Box::pin(node_watcher::run(c, ctrl, op, operator_port, ns, sel))
+        })
+        .await;
     });
 
-    // Spawn job controller
+    // Spawn job controller (issue #52: retry on failure)
     let jc_client = client.clone();
     let jc_ctrl_addr = args.controller_addr.clone();
     let jc_ns = args.namespace.clone();
     tokio::spawn(async move {
-        if let Err(e) = job_controller::run(jc_client, jc_ctrl_addr, jc_ns).await {
-            tracing::error!(error = %e, "job controller exited");
-        }
+        run_with_retry("job controller", || {
+            let c = jc_client.clone();
+            let ctrl = jc_ctrl_addr.clone();
+            let ns = jc_ns.clone();
+            Box::pin(job_controller::run(c, ctrl, ns))
+        })
+        .await;
     });
 
     // Start virtual agent gRPC server
@@ -144,6 +163,33 @@ async fn main() -> anyhow::Result<()> {
     Ok(())
 }
 
+/// Run an async task with exponential backoff retry on failure (issue #52).
+///
+/// If the task exits with an error, it is restarted after a delay that doubles
+/// each time (1s → 2s → 4s → ... → 60s max). On success the backoff resets.
+async fn run_with_retry<F, Fut>(name: &str, mut factory: F) -> !
+where
+    F: FnMut() -> std::pin::Pin<Box<Fut>>,
+    Fut: std::future::Future<Output = anyhow::Result<()>> + Send + 'static,
+{
+    let mut backoff = std::time::Duration::from_secs(1);
+    let max_backoff = std::time::Duration::from_secs(60);
+
+    loop {
+        match factory().await {
+            Ok(()) => {
+                tracing::warn!(%name, "task exited cleanly, restarting");
+                backoff = std::time::Duration::from_secs(1);
+            }
+            Err(e) => {
+                tracing::error!(%name, error = %e, backoff_secs = backoff.as_secs(), "task failed, retrying");
+                tokio::time::sleep(backoff).await;
+                backoff = std::cmp::min(backoff * 2, max_backoff);
+            }
+        }
+    }
+}
+
 fn generate_crd() {
     use kube::CustomResourceExt;
     let crd = crd::SpurJob::crd();
diff --git a/crates/spur-sched/src/backfill.rs b/crates/spur-sched/src/backfill.rs
index f350810..acb5416 100644
--- a/crates/spur-sched/src/backfill.rs
+++ b/crates/spur-sched/src/backfill.rs
@@ -239,7 +239,7 @@ impl Scheduler for BackfillScheduler {
 
             let required = job_resource_request(job);
             let duration = job.spec.time_limit.unwrap_or(Duration::hours(1));
-            let needed_nodes = job.spec.num_nodes as usize;
+            let needed_nodes = (job.spec.num_nodes as usize).max(1);
 
             // Find earliest start across needed_nodes
             let mut node_starts: Vec<(usize, chrono::DateTime<Utc>)> = suitable
diff --git a/crates/spur-tests/src/t07_sched.rs b/crates/spur-tests/src/t07_sched.rs
index 12e7f41..f5c1093 100644
--- a/crates/spur-tests/src/t07_sched.rs
+++ b/crates/spur-tests/src/t07_sched.rs
@@ -621,4 +621,100 @@ mod tests {
         let assignments = sched.schedule(&[job], &cluster);
         assert_eq!(assignments.len(), 1);
     }
+
+    // ── Issue #56: edge cases that could crash the scheduler ─────
+
+    #[test]
+    fn t07_27_num_nodes_zero_does_not_panic() {
+        // Issue #56: A job with num_nodes=0 should be handled safely
+        // instead of panicking on .max().unwrap() with empty iterator.
+        reset_job_ids();
+        let mut sched = BackfillScheduler::new(100);
+        let nodes = make_nodes(2, 64, 256_000);
+        let partitions = vec![make_partition("default", 2)];
+        let mut job = make_job("zero-nodes");
+        job.spec.num_nodes = 0;
+
+        let cluster = ClusterState {
+            nodes: &nodes,
+            partitions: &partitions,
+            reservations: &[],
+        };
+        // Must not panic — should schedule with 1 node (the minimum)
+        let assignments = sched.schedule(&[job], &cluster);
+        assert_eq!(assignments.len(), 1);
+        assert_eq!(assignments[0].nodes.len(), 1);
+    }
+
+    #[test]
+    fn t07_28_single_idle_node_schedules_immediately() {
+        // Issue #56 regression: A single idle node with a single pending
+        // job should result in immediate scheduling (no Reason=Priority).
+        reset_job_ids();
+        let mut sched = BackfillScheduler::new(100);
+        let nodes = make_nodes(1, 64, 256_000);
+        let partitions = vec![make_partition("default", 1)];
+        let job = make_job("simple");
+
+        let cluster = ClusterState {
+            nodes: &nodes,
+            partitions: &partitions,
+            reservations: &[],
+        };
+        let assignments = sched.schedule(&[job], &cluster);
+        assert_eq!(
+            assignments.len(),
+            1,
+            "single idle node should schedule job immediately"
+        );
+        assert_eq!(assignments[0].nodes[0], "node001");
+    }
+
+    #[test]
+    fn t07_29_constraint_mismatch_not_scheduled() {
+        // Issue #56: A job with --constraint=gpu should NOT be scheduled
+        // on a node without the "gpu" feature.
+        reset_job_ids();
+        let mut sched = BackfillScheduler::new(100);
+        let nodes = make_nodes(2, 64, 256_000); // nodes have NO features
+        let partitions = vec![make_partition("default", 2)];
+        let mut job = make_job("gpu-job");
+        job.spec.constraint = Some("gpu".into());
+
+        let cluster = ClusterState {
+            nodes: &nodes,
+            partitions: &partitions,
+            reservations: &[],
+        };
+        let assignments = sched.schedule(&[job], &cluster);
+        assert_eq!(
+            assignments.len(),
+            0,
+            "job requiring gpu feature should not schedule on featureless nodes"
+        );
+    }
+
+    #[test]
+    fn t07_30_exclusive_job_needs_idle_node() {
+        // Issue #56: An exclusive job should only schedule on a node
+        // with zero current allocations.
+        reset_job_ids();
+        let mut sched = BackfillScheduler::new(100);
+        let mut nodes = make_nodes(2, 64, 256_000);
+        // Node 1 has partial allocations
+        nodes[0].alloc_resources.cpus = 32;
+        let partitions = vec![make_partition("default", 2)];
+        let mut job = make_job("exclusive");
+        job.spec.exclusive = true;
+
+        let cluster = ClusterState {
+            nodes: &nodes,
+            partitions: &partitions,
+            reservations: &[],
+        };
+        let assignments = sched.schedule(&[job], &cluster);
+        assert_eq!(assignments.len(), 1);
+        // Should land on node002 (the idle one), not node001 (partially allocated)
+        assert_eq!(assignments[0].nodes[0], "node002");
+    }
 }
diff --git a/crates/spur-tests/src/t50_core.rs b/crates/spur-tests/src/t50_core.rs
index de1e40e..fdeeeb3 100644
--- a/crates/spur-tests/src/t50_core.rs
+++ b/crates/spur-tests/src/t50_core.rs
@@ -1104,4 +1104,247 @@ address = "http://peer-a:6817"
         assert_eq!(sanitized, expected);
         assert_eq!(format!("{}.sqsh", sanitized), basename);
     }
+
+    // ── Issue #56 (reopen #47): scheduler crash recovery ─────────
+
+    #[test]
+    fn t50_96_scheduler_handles_num_nodes_zero_safely() {
+        // Issue #56: If num_nodes is somehow 0, the scheduler should not
+        // panic on .max().unwrap() with an empty iterator.
+        // The fix ensures num_nodes.max(1) is used in the scheduling loop.
+        use spur_sched::traits::Scheduler;
+        let mut sched = spur_sched::backfill::BackfillScheduler::new(100);
+        let nodes = vec![{
+            let mut n = Node::new(
+                "node001".into(),
+                ResourceSet {
+                    cpus: 64,
+                    memory_mb: 256_000,
+                    ..Default::default()
+                },
+            );
+            n.state = spur_core::node::NodeState::Idle;
+            n.partitions = vec!["default".into()];
+            n
+        }];
+        let partitions = vec![make_partition("default", 1)];
+
+        // Create a job with num_nodes=0 (edge case)
+        let mut job = make_job("edge-case");
+        job.spec.num_nodes = 0;
+
+        let cluster = spur_sched::traits::ClusterState {
+            nodes: &nodes,
+            partitions: &partitions,
+            reservations: &[],
+        };
+        // This should NOT panic
+        let assignments = sched.schedule(&[job], &cluster);
+        // With num_nodes.max(1), the job should still get scheduled
+        assert_eq!(assignments.len(), 1);
+    }
+
+    // ── Issue #56: update_pending_reasons checks constraints ─────
+
+    #[test]
+    fn t50_97_pending_reason_respects_constraints() {
+        // Issue #56: A job with --constraint=gpu should show
+        // Reason=Resources (not Priority) when no node has that feature.
+        let _reason_with_features: () = {
+            let mut node = Node::new(
+                "node001".into(),
+                ResourceSet {
+                    cpus: 64,
+                    memory_mb: 256_000,
+                    ..Default::default()
+                },
+            );
+            node.state = spur_core::node::NodeState::Idle;
+            // Node has NO features
+
+            // Job requires "gpu" feature
+            let mut job = make_job("constrained");
+            job.spec.constraint = Some("gpu".into());
+
+            // The node is schedulable and can satisfy resources,
+            // but doesn't have the constraint feature.
+            // update_pending_reasons should set Resources, not Priority.
+            let is_capable = node.is_schedulable()
+                && node.total_resources.can_satisfy(&ResourceSet {
+                    cpus: 1,
+                    memory_mb: 0,
+                    ..Default::default()
+                });
+            // Old behavior: this would be true → Priority (misleading)
+            assert!(is_capable, "basic capability check passes");
+
+            // New behavior: constraint check should reject the node
+            let constraint = job.spec.constraint.as_deref().unwrap();
+            let features: Vec<&str> = constraint.split(',').map(str::trim).collect();
+            let passes_constraint = features
+                .iter()
+                .all(|f| node.features.contains(&f.to_string()));
+            assert!(
+                !passes_constraint,
+                "constraint check should reject node without gpu feature"
+            );
+        };
+    }
+
+    // ── Issue #55 (reopen): agent image_dir fallback ─────────────
+
+    #[test]
+    fn t50_98_image_dir_fallback_logic() {
+        // Issue #55: The agent's image_dir() should fall back to
+        // ~/.spur/images when /var/spool/spur/images doesn't exist.
+        // This test verifies the fallback logic conceptually.
+        let system_dir = std::path::Path::new("/var/spool/spur/images");
+        let home = std::env::var_os("HOME");
+
+        // If system dir doesn't exist and HOME is set, fallback should
+        // point to ~/.spur/images
+        if !system_dir.is_dir() {
+            if let Some(home) = home {
+                let expected = std::path::PathBuf::from(home).join(".spur/images");
+                // Verify the path construction is correct
+                assert!(expected.to_str().unwrap().contains(".spur/images"));
+            }
+        }
+    }
+
+    // ── Issue #54 (reopen): sattach buffer sizes ─────────────────
+
+    #[test]
+    fn t50_99_attach_job_messages_support_raw_bytes() {
+        // Issue #54: AttachJobInput should carry raw bytes (not just
+        // newline-terminated lines) for interactive use.
+        let input = spur_proto::proto::AttachJobInput {
+            job_id: 42,
+            data: vec![0x1b, 0x5b, 0x41], // ESC [ A (arrow up)
+        };
+        assert_eq!(input.data.len(), 3);
+        assert_eq!(input.data[0], 0x1b); // ESC byte
+
+        let output = spur_proto::proto::AttachJobOutput {
+            data: vec![0x1b, 0x5b, 0x48], // ESC [ H (cursor home)
+            eof: false,
+        };
+        assert_eq!(output.data.len(), 3);
+        assert!(!output.eof);
+    }
+
+    // ── Issue #53: CLI show dispatch ─────────────────────────────
+
+    #[test]
+    fn t50_100_show_dispatch_inserts_implicit_show() {
+        // Issue #53: `spur show node X` should dispatch as
+        // `scontrol show node X`, not `scontrol node X`.
+        //
+        // Simulate the argv rewriting logic from main.rs.
+        let args: Vec<String> = vec!["spur".into(), "show".into(), "node".into(), "gpu-1".into()];
+
+        let cmd = "scontrol";
+        let implicit_show = args[1].as_str() == "show" && cmd == "scontrol";
+
+        let rewritten: Vec<String> = std::iter::once(cmd.to_string())
+            .chain(if implicit_show {
+                vec!["show".to_string()]
+            } else {
+                vec![]
+            })
+            .chain(args[2..].iter().cloned())
+            .collect();
+
+        assert_eq!(rewritten, vec!["scontrol", "show", "node", "gpu-1"]);
+    }
+
+    #[test]
+    fn t50_101_control_dispatch_no_implicit_show() {
+        // `spur control show node X` should NOT insert extra show.
+        let args: Vec<String> = vec![
+            "spur".into(),
+            "control".into(),
+            "show".into(),
+            "node".into(),
+            "gpu-1".into(),
+        ];
+
+        let cmd = "scontrol";
+        let implicit_show = args[1].as_str() == "show" && cmd == "scontrol";
+
+        let rewritten: Vec<String> = std::iter::once(cmd.to_string())
+            .chain(if implicit_show {
+                vec!["show".to_string()]
+            } else {
+                vec![]
+            })
+            .chain(args[2..].iter().cloned())
+            .collect();
+
+        // "control" != "show", so no implicit show inserted
+        assert_eq!(rewritten, vec!["scontrol", "show", "node", "gpu-1"]);
+    }
+
+    // ── Issue #51: K8s operator address resolution ───────────────
+
+    #[test]
+    fn t50_102_k8s_address_resolution_priority() {
+        // Issue #51: The operator should prefer --address flag over
+        // POD_IP env var over hostname. Verify the priority logic.
+        //
+        // Test the priority: explicit > POD_IP > listen IP > hostname
+        let explicit = Some("10.0.0.1".to_string());
+        let pod_ip: Result<String, std::env::VarError> = Ok("10.0.0.2".to_string());
+        let listen_is_unspecified = true;
+
+        // Explicit wins
+        let result = if let Some(ref addr) = explicit {
+            addr.clone()
+        } else if let Ok(ip) = pod_ip.as_ref() {
+            ip.clone()
+        } else if !listen_is_unspecified {
+            "10.0.0.3".into()
+        } else {
+            "pod-hostname-abc123".into()
+        };
+        assert_eq!(result, "10.0.0.1");
+    }
+
+    #[test]
+    fn t50_103_k8s_pod_ip_fallback() {
+        // When no explicit address, POD_IP should be used
+        let explicit: Option<String> = None;
+        let pod_ip: Result<String, ()> = Ok("10.244.1.5".to_string());
+        let listen_is_unspecified = true;
+
+        let result = if let Some(ref addr) = explicit {
+            addr.clone()
+        } else if let Ok(ip) = pod_ip.as_ref() {
+            ip.clone()
+        } else if !listen_is_unspecified {
+            "10.0.0.3".into()
+        } else {
+            "pod-hostname-abc123".into()
+        };
+        assert_eq!(result, "10.244.1.5");
+    }
+
+    // ── Issue #52: retry loop backoff ────────────────────────────
+
+    #[test]
+    fn t50_104_retry_backoff_doubles_then_caps() {
+        // Issue #52: verify exponential backoff logic caps at 60s.
+        let max_backoff = std::time::Duration::from_secs(60);
+        let mut backoff = std::time::Duration::from_secs(1);
+
+        // Simulate 10 failures
+        for _ in 0..10 {
+            backoff = std::cmp::min(backoff * 2, max_backoff);
+        }
+        assert_eq!(backoff, max_backoff);
+
+        // Simulate success resets
+        backoff = std::time::Duration::from_secs(1);
+        assert_eq!(backoff.as_secs(), 1);
+    }
 }
diff --git a/crates/spurctld/src/cluster.rs b/crates/spurctld/src/cluster.rs
index 787e0a1..ae27e23 100644
--- a/crates/spurctld/src/cluster.rs
+++ b/crates/spurctld/src/cluster.rs
@@ -1268,14 +1268,45 @@ impl ClusterManager {
             }
 
             // Nodes exist but may be fully allocated — check if any idle node
-            // can satisfy resource requirements
+            // can satisfy resource requirements.
+            //
+            // This must mirror the checks in BackfillScheduler::find_suitable_nodes
+            // so that the displayed reason accurately reflects why the scheduler
+            // can't place the job (issue #56 — reopened #47).
             let required = spur_sched::backfill::job_resource_request(job);
-            let has_capable_node = nodes_in_partition
-                .iter()
-                .any(|n| n.is_schedulable() && n.total_resources.can_satisfy(&required));
+            let has_capable_node = nodes_in_partition.iter().any(|n| {
+                if !n.is_schedulable() {
+                    return false;
+                }
+                // Skip nodes fully consumed by existing allocations
+                if n.alloc_resources.cpus >= n.total_resources.cpus && n.total_resources.cpus > 0 {
+                    return false;
+                }
+                // Exclusive job needs an idle node (no current allocations)
+                if job.spec.exclusive
+                    && (n.alloc_resources.cpus > 0 || !n.alloc_resources.gpus.is_empty())
+                {
+                    return false;
+                }
+                // Constraint feature check
+                if let Some(ref constraint) = job.spec.constraint {
+                    let required_features: Vec<&str> = constraint
+                        .split(',')
+                        .map(str::trim)
+                        .filter(|s| !s.is_empty())
+                        .collect();
+                    if !required_features
+                        .iter()
+                        .all(|f| n.features.contains(&f.to_string()))
+                    {
+                        return false;
+                    }
+                }
+                n.total_resources.can_satisfy(&required)
+            });
 
             if !has_capable_node {
-                // Resources insufficient on all nodes
+                // Resources insufficient or constraints prevent scheduling
                 job_entry.pending_reason = PendingReason::Resources;
             } else {
                 // Capable nodes exist but currently occupied — backfill will
diff --git a/crates/spurctld/src/scheduler_loop.rs b/crates/spurctld/src/scheduler_loop.rs
index caade06..56fc1c6 100644
--- a/crates/spurctld/src/scheduler_loop.rs
+++ b/crates/spurctld/src/scheduler_loop.rs
@@ -63,7 +63,24 @@ pub async fn run(cluster: Arc<ClusterManager>) {
             reservations: &reservations,
         };
 
-        let assignments = scheduler.schedule(&pending, &cluster_state);
+        // Catch panics in the scheduler so that a single bad job doesn't kill
+        // the entire scheduling loop (issue #56).
+        let sched_ref = &mut scheduler;
+        let assignments = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            sched_ref.schedule(&pending, &cluster_state)
+        })) {
+            Ok(a) => a,
+            Err(e) => {
+                error!(
+                    "scheduler panicked: {:?} — skipping cycle",
+                    e.downcast_ref::<String>()
+                        .map(|s| s.as_str())
+                        .or_else(|| e.downcast_ref::<&str>().copied())
+                        .unwrap_or("unknown")
+                );
+                continue;
+            }
+        };
 
         // Preemption: if high-priority jobs couldn't be scheduled,
         // cancel lower-priority running jobs to free resources.
diff --git a/crates/spurd/src/agent_server.rs b/crates/spurd/src/agent_server.rs
index d278aaf..0487e28 100644
--- a/crates/spurd/src/agent_server.rs
+++ b/crates/spurd/src/agent_server.rs
@@ -869,7 +869,9 @@ impl SlurmAgent for AgentService {
             }
         };
 
-        let (tx, rx) = tokio::sync::mpsc::channel::<Result<AttachJobOutput, Status>>(32);
+        // Issue #54: Use a larger buffer to prevent deadlock when stdout+stderr
+        // produce high-volume output concurrently.
+        let (tx, rx) = tokio::sync::mpsc::channel::<Result<AttachJobOutput, Status>>(256);
 
         tokio::spawn(async move {
             // Spawn an interactive shell inside the job's cgroup/namespace
@@ -1002,10 +1004,16 @@ impl SlurmAgent for AgentService {
                 }
             }
 
-            // Wait for child to exit
+            // Wait for child to exit, then let I/O tasks drain gracefully
+            // before sending EOF. Aborting immediately loses buffered data
+            // (issue #54).
             let _ = child.wait().await;
+            // Give tasks a moment to flush remaining data
+            let _ = tokio::time::timeout(std::time::Duration::from_secs(2), async {
+                let _ = stderr_task.await;
+            })
+            .await;
             stdin_task.abort();
-            stderr_task.abort();
 
             // Send EOF
             let _ = tx_clone
diff --git a/crates/spurd/src/container.rs b/crates/spurd/src/container.rs
index 2b4103b..e6b36f6 100644
--- a/crates/spurd/src/container.rs
+++ b/crates/spurd/src/container.rs
@@ -25,13 +25,25 @@ const CONTAINER_DIR: &str = "/var/spool/spur/containers";
 ///
 /// This must match the CLI's `resolve_image_dir()` logic so that images
 /// imported via `spur image import` are found by the agent at job launch.
+///
+/// Priority (issue #55 — matches CLI's 3-tier fallback):
+/// 1. `$SPUR_IMAGE_DIR` environment variable
+/// 2. `/var/spool/spur/images` if it exists
+/// 3. `~/.spur/images/` as user-local fallback
 fn image_dir() -> PathBuf {
     if let Ok(dir) = std::env::var("SPUR_IMAGE_DIR") {
         if !dir.is_empty() {
             return PathBuf::from(dir);
         }
     }
-    PathBuf::from(DEFAULT_IMAGE_DIR)
+    let system_dir = Path::new(DEFAULT_IMAGE_DIR);
+    if system_dir.is_dir() {
+        return system_dir.to_path_buf();
+    }
+    if let Some(home) = std::env::var_os("HOME") {
+        return PathBuf::from(home).join(".spur/images");
+    }
+    system_dir.to_path_buf()
 }
 
 /// A parsed bind mount specification.