From f035a233c3a0c06e6b8b1d74335cc82f05c454ce Mon Sep 17 00:00:00 2001 From: "Cliff L. Biffle" Date: Tue, 24 Feb 2026 13:55:14 -0800 Subject: [PATCH 1/2] hiffy: improve task synchronization after reset Manufacturing test revealed an occasional flake in programming auxflash on Minibar. It turns out to be a multi-part problem. We program auxflash by driving Hiffy through Humility to generate IPCs. This means the flash process is 1. Write firmware to the SP's internal flash. 2. Reboot into it. 3. Enlist the hiffy task to check and program auxflash. 4. Reboot again. Humility synchronizes with Hiffy by watching for a `HIFFY_READY` variable to go from 0->1. Unfortunately, if the hiffy task faults or the system reboots, nothing switches that back to 0 until hiffy's bss initialization code runs. It turns out that Minibar has several high priority tasks that use CPU at boot, delaying hiffy's task initialization, including bss. This means that, after a reboot, Humility can observe arbitrary contents in the `HIFFY_READY` variable for quite a long period of time (about 42ms in my tests). In this specific case, it's usually a `1` left there by the previous incarnation of the system... but it could also be random data, if the last image we were running had Hiffy at a different physical address. (Yes, this is an uninitialized variable bug despite being all safe Rust. Debuggers: Always Unsafe(tm)) Fixing this has turned out to be a little subtle. The approach taken by this commit is to "detect" the completion of Hiffy's task initialization by seeing it enter into a system call. Since Hiffy spends 99+% of its time parked in sleeps, this is a reliable indicator that control has gotten to main and the contents of `HIFFY_READY` are now initialized. --- Cargo.lock | 1 + humility-hiffy/Cargo.toml | 1 + humility-hiffy/src/lib.rs | 118 ++++++++++++++++++++++++++++++++++---- 3 files changed, 108 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 22645261..4f97b99f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2435,6 +2435,7 @@ dependencies = [ "humility-doppel", "humility-idol", "idol", + "log", "parse_int", "postcard", "zerocopy", diff --git a/humility-hiffy/Cargo.toml b/humility-hiffy/Cargo.toml index b48bac1a..037ddca8 100644 --- a/humility-hiffy/Cargo.toml +++ b/humility-hiffy/Cargo.toml @@ -14,3 +14,4 @@ zerocopy.workspace = true humility.workspace = true humility-doppel.workspace = true humility-idol.workspace = true +log.workspace = true diff --git a/humility-hiffy/src/lib.rs b/humility-hiffy/src/lib.rs index eebed399..6ae69854 100644 --- a/humility-hiffy/src/lib.rs +++ b/humility-hiffy/src/lib.rs @@ -9,7 +9,7 @@ use hif::*; use humility::core::{Core, NetAgent}; use humility::hubris::*; use humility::reflect::{self, Load, Value}; -use humility_doppel::{RpcHeader, StaticCell}; +use humility_doppel::{RpcHeader, SchedState, StaticCell, TaskState}; use humility_idol as idol; pub use humility_idol::IpcError; use postcard::{take_from_bytes, to_slice}; @@ -936,27 +936,79 @@ impl<'a> HiffyContext<'a> { ); } - let mut text: Vec = vec![]; - text.resize_with(self.text.size, Default::default); + let mut text = vec![0u8; self.text.size]; core.op_start()?; // // We now want to loop until we know that the HIF facility is - // available. Generally, if we see that HIFFY_READY is 0, it is - // because our in situ cohort either never ran, or has not checked - // back in (i.e., it is wedged on an earlier call). But it's also - // conceivable that we have caught it in the very small window when it - // is awake, but only to check if it has been kicked; to differentiate - // these cases, we back off for a linearly increasing number of - // milliseconds until we conclude that the facility is alive, or that - // we have tried too many times. // + // 1. During startup of the hiffy task, as bss is initialized. + // 2. Periodically when hiffy wakes up to check for instructions. + // + // If we see HIFFY_READY==0, it may mean that we've observed it at one + // of those two windows, which generally means that we should wait and + // try again -- giving hiffy time to finish initializing, or to go back + // to sleep and wait for instructions, respectively. + // + // However -- if HIFFY_READY==1, things are more complicated. + // + // The hiffy task sets HIFFY_READY==1 when it sleeps to allow the + // debugger (hello!) to deliver new instructions. So far so good. + // + // If the system under test (or just the hiffy task) is _restarted,_ + // nothing clears HIFFY_READY until bss initialization sets it to 0. + // For that to happen, the hiffy task has to get some CPU time. Because + // hiffy is pretty low priority, this can take a while if other system + // startup things are expensive. + // + // So we need some sort of preflight check to verify that we can trust + // the contents of HIFFY_READY, and we do this by checking to see if the + // hiffy task is in a syscall, since all syscalls happen _after_ task + // initialization. + // + let hiffy_task = self.hubris.lookup_task("hiffy").and_then(|t| { + match t { + // This really shouldn't happen, but it's permitted by the API + // types, sooooooo + HubrisTask::Kernel => { + log::warn!( + "This application appears to have named its kernel \ + 'hiffy', which is very funny ha ha but will make \ + IPC operations and other things using hiffy less \ + reliable. You should not be seeing this message, \ + please report it to whoever's responsible for this \ + firmware. (If it's you, my condolences.)" + ); + + None + } + + HubrisTask::Task(i) => Some(*i), + } + }); + + let mut syscall_observed = false; + if hiffy_task.is_none() { + log::warn!( + "Can't find task named 'hiffy' in image. We'll still try to \ + use it, but if this is just after a reboot, we may not be \ + able to synchronize with its startup correctly, which may \ + produce intermittent flaky behavior. This message is not \ + expected in normal firmware." + ); + syscall_observed = true; + } + let mut lap = 0; const MAX_LAPS: u64 = 10; let ready = loop { - if core.read_word_32(self.ready.addr)? == 1 { + if !syscall_observed { + if has_task_started(self.hubris, core, hiffy_task.unwrap())? { + syscall_observed = true; + } + } else if core.read_word_32(self.ready.addr)? == 1 { break true; } @@ -1206,6 +1258,48 @@ impl<'a> HiffyContext<'a> { } } +/// Performs a somewhat probabilistic check for whether a task has _started,_ by +/// which we specifically mean "run for long enough to start executing code in +/// main." +/// +/// Running this enough times to get `true` in response means it's safe to +/// access initialized memory in the task. +fn has_task_started( + hubris: &HubrisArchive, + core: &mut dyn Core, + task_index: u32, +) -> Result { + let task_t = hubris.lookup_struct_byname("Task")?; + + let (base, _) = hubris.task_table(core)?; + + let addr = base + (task_index * task_t.size as u32); + let mut buffer = vec![0; task_t.size]; + core.read_8(addr, &mut buffer)?; + + let task: humility_doppel::Task = + reflect::load(hubris, &buffer, task_t, 0)?; + // How can we tell that a task has finished pre-main initialization? Let me + // count the ways... + match task.state { + TaskState::Healthy(ss) => match ss { + // Tasks don't issue IPCs during pre-main, so if the task is in + // SEND, REPLY, or RECV, it has by definition gotten into main. + SchedState::InSend(_) + | SchedState::InReply(_) + | SchedState::InRecv(_) => Ok(true), + // A task may be stopped or "runnable"/"running" before main, so + // from this information alone, we can't tell one way or the other. + SchedState::Stopped | SchedState::Runnable => Ok(false), + }, + // A badly configured task could hit a fault before main by initializing + // memory badly, or the supervisor could deliver a fault before the task + // has a chance to run. Either way, this state interferes with our + // ability to reliably detect startup. + TaskState::Faulted { .. } => Ok(false), + } +} + /// Executes a Hiffy call, printing the output to the terminal /// /// Returns an outer error if Hiffy communication fails, or an inner error From 5eff8618276a058d5fc61da9ffdb40895f18a421 Mon Sep 17 00:00:00 2001 From: "Cliff L. Biffle" Date: Fri, 27 Feb 2026 16:14:11 -0800 Subject: [PATCH 2/2] version to 0.12.13 (hiffy sync change) --- Cargo.lock | 2 +- humility-bin/Cargo.toml | 2 +- humility-bin/tests/cmd/chip.trycmd | 4 ++-- humility-bin/tests/cmd/version.trycmd | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f97b99f..34508414 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1291,7 +1291,7 @@ dependencies = [ [[package]] name = "humility-bin" -version = "0.12.12" +version = "0.12.13" dependencies = [ "anyhow", "bitfield", diff --git a/humility-bin/Cargo.toml b/humility-bin/Cargo.toml index 9ecd0cf1..b16ff272 100644 --- a/humility-bin/Cargo.toml +++ b/humility-bin/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "humility-bin" edition.workspace = true -version = "0.12.12" +version = "0.12.13" license = "MPL-2.0" [build-dependencies] diff --git a/humility-bin/tests/cmd/chip.trycmd b/humility-bin/tests/cmd/chip.trycmd index 84b571ae..817ba4f8 100644 --- a/humility-bin/tests/cmd/chip.trycmd +++ b/humility-bin/tests/cmd/chip.trycmd @@ -13,7 +13,7 @@ For more information try --help ``` $ humility --chip this-can-be-anything -V -humility 0.12.12 +humility 0.12.13 ``` @@ -28,7 +28,7 @@ For more information try --help ``` $ humility -c apx432 -V -humility 0.12.12 +humility 0.12.13 ``` diff --git a/humility-bin/tests/cmd/version.trycmd b/humility-bin/tests/cmd/version.trycmd index 722b2bfa..49685fc3 100644 --- a/humility-bin/tests/cmd/version.trycmd +++ b/humility-bin/tests/cmd/version.trycmd @@ -2,7 +2,7 @@ Long version flag: ``` $ humility --version -humility 0.12.12 +humility 0.12.13 ``` @@ -10,6 +10,6 @@ Short version flag: ``` $ humility -V -humility 0.12.12 +humility 0.12.13 ```