From 351d1838d1fb084d1a7032ec05dc1e167ff2778b Mon Sep 17 00:00:00 2001 From: Aaron-Hartwig Date: Sun, 22 Feb 2026 20:14:17 -0600 Subject: [PATCH 1/4] sidecar-seq: re-sequence on bad PCIe link --- .../src/tofino2.rs | 5 ++ drv/sidecar-seq-server/src/main.rs | 50 +++++++++++++++++++ drv/sidecar-seq-server/src/tofino.rs | 17 +++++++ 3 files changed, 72 insertions(+) diff --git a/drv/sidecar-mainboard-controller/src/tofino2.rs b/drv/sidecar-mainboard-controller/src/tofino2.rs index 009fe39f37..8627b5fde4 100644 --- a/drv/sidecar-mainboard-controller/src/tofino2.rs +++ b/drv/sidecar-mainboard-controller/src/tofino2.rs @@ -432,6 +432,11 @@ impl Sequencer { pub fn pcie_hotplug_status(&self) -> Result { self.fpga.read(Addr::PCIE_HOTPLUG_STATUS) } + + pub fn is_pcie_reset(&self) -> Result { + let status: u8 = self.pcie_hotplug_status()?; + Ok(status & Reg::PCIE_HOTPLUG_STATUS::HOST_RESET != 0) + } } bitfield! { diff --git a/drv/sidecar-seq-server/src/main.rs b/drv/sidecar-seq-server/src/main.rs index 170a977b2c..9ffc73e8d6 100644 --- a/drv/sidecar-seq-server/src/main.rs +++ b/drv/sidecar-seq-server/src/main.rs @@ -81,6 +81,8 @@ enum Trace { TofinoCfgRegisterValue(TofinoCfgRegisters, u32), TofinoPowerUp, TofinoPowerDown, + TofinoResequence, + TofinoPcieReset(bool), SetVddCoreVout(userlib::units::Volts), SetPCIePresent, ClearPCIePresent, @@ -169,6 +171,10 @@ struct ServerImpl { // a piece of state to allow blinking LEDs to be in phase led_blink_on: bool, sys: sys_api::Sys, + // used to track how many notification loops elapsed while in A0 without a + // PCIe link + no_pcie_count: u8, + resequenced: bool, } impl ServerImpl { @@ -376,6 +382,38 @@ impl ServerImpl { Err(e) => Err(e), } } + + // Monitor the status of Tofino's PCIe link to the host. When a host is + // connected and has booted, PERST will be deasserted and link will train. + // This worked reliably on Gimlet, but for reasons we've not yet been able + // to identify it can fail on Cosmo (host thinks things are fine, Tofino + // does not). When this happens, resequencing the Tofino reliably + // establishes the link. So we will monitor if we think there should be + // a PCIe link or not, and if we think there should be one but the Tofino + // thinks it is down we will resequence the Tofino after some delay. + fn monitor_tofino_pcie_link(&mut self) -> Result<(), SeqError> { + if !self + .tofino + .sequencer + .is_pcie_reset() + .map_err(|_| SeqError::FpgaError)? + && !self.tofino.pcie_link_up()? + && !self.resequenced + { + self.no_pcie_count += 1; + + if self.no_pcie_count >= 30 { + // We have failed to establish a link, resequence. + ringbuf_entry!(Trace::TofinoResequence); + self.tofino.power_down()?; + self.tofino.power_up()?; + self.resequenced = true; + } + } else { + self.no_pcie_count = 0; + } + Ok(()) + } } impl idl::InOrderSequencerImpl for ServerImpl { @@ -825,6 +863,16 @@ impl NotificationHandler for ServerImpl { // Fan module monitoring pulled out to keep this loop readable self.monitor_fan_modules(); + // Monitor Tofino PCIe Link + self.tofino.poll_pcie_reset(); + if self.tofino.sequencer.state().unwrap_or(TofinoSeqState::A2) + == TofinoSeqState::A0 + { + if let Err(e) = self.monitor_tofino_pcie_link() { + ringbuf_entry!(Trace::TofinoSequencerError(e)); + } + } + let finish = sys_get_timer().now; // We now know when we were notified and when any work was completed. @@ -911,6 +959,8 @@ fn main() -> ! { fan_modules, led_blink_on: false, sys, + no_pcie_count: 0, + resequenced: false, }; ringbuf_entry!(Trace::FpgaInit); diff --git a/drv/sidecar-seq-server/src/tofino.rs b/drv/sidecar-seq-server/src/tofino.rs index f5fca52a14..16293c7cee 100644 --- a/drv/sidecar-seq-server/src/tofino.rs +++ b/drv/sidecar-seq-server/src/tofino.rs @@ -13,6 +13,7 @@ pub(crate) struct Tofino { pub abort_reported: bool, pub ready_for_power_up: bool, pub pcie_link_up: bool, + pub pcie_reset_asserted: bool, } impl Tofino { @@ -27,6 +28,7 @@ impl Tofino { abort_reported: false, ready_for_power_up: false, pcie_link_up: false, + pcie_reset_asserted: true, } } @@ -77,6 +79,21 @@ impl Tofino { == 0xf) } + /// Poll FPGA for status of the PCIe reset signal from the host. Note that the + /// logic level of the reset signal has been normalized in the FPGA, so asserted + /// (logic low on PERST_L) will be true and deasserted (logic high) will be false). + pub fn poll_pcie_reset(&mut self) -> Result { + let reset_asserted = self + .sequencer + .is_pcie_reset() + .map_err(|_| SeqError::FpgaError)?; + if reset_asserted != self.pcie_reset_asserted { + ringbuf_entry!(Trace::TofinoPcieReset(reset_asserted)); + } + self.pcie_reset_asserted = reset_asserted; + Ok(self.pcie_reset_asserted) + } + pub fn power_up(&mut self) -> Result<(), SeqError> { ringbuf_entry!(Trace::TofinoPowerUp); From d487eb6fcbac3fb1692f7fa4e489428f1565174d Mon Sep 17 00:00:00 2001 From: Aaron-Hartwig Date: Wed, 4 Mar 2026 14:17:56 -0600 Subject: [PATCH 2/4] feedback from Laura --- drv/sidecar-seq-server/src/main.rs | 13 +++++++++---- drv/sidecar-seq-server/src/tofino.rs | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drv/sidecar-seq-server/src/main.rs b/drv/sidecar-seq-server/src/main.rs index 9ffc73e8d6..c7a4f02129 100644 --- a/drv/sidecar-seq-server/src/main.rs +++ b/drv/sidecar-seq-server/src/main.rs @@ -122,6 +122,7 @@ enum Trace { ringbuf!(Trace, 32, Trace::None); const TIMER_INTERVAL: u64 = 1000; +const NO_PCIE_LIMIT: u8 = 30; // QSFP_2_SP_A2_PG const POWER_GOOD: sys_api::PinSet = sys_api::Port::F.pin(12); @@ -172,8 +173,10 @@ struct ServerImpl { led_blink_on: bool, sys: sys_api::Sys, // used to track how many notification loops elapsed while in A0 without a - // PCIe link + // PCIe link. This will be capped at NO_PCIE_LIMIT loops and should not + // overflow. no_pcie_count: u8, + // keep track if we've resequenced since we only want to try it once resequenced: bool, } @@ -398,11 +401,10 @@ impl ServerImpl { .is_pcie_reset() .map_err(|_| SeqError::FpgaError)? && !self.tofino.pcie_link_up()? - && !self.resequenced { self.no_pcie_count += 1; - if self.no_pcie_count >= 30 { + if self.no_pcie_count >= NO_PCIE_LIMIT { // We have failed to establish a link, resequence. ringbuf_entry!(Trace::TofinoResequence); self.tofino.power_down()?; @@ -864,9 +866,12 @@ impl NotificationHandler for ServerImpl { self.monitor_fan_modules(); // Monitor Tofino PCIe Link - self.tofino.poll_pcie_reset(); + if let Err(e) = self.tofino.poll_pcie_reset() { + ringbuf_entry!(Trace::TofinoSequencerError(e)); + } if self.tofino.sequencer.state().unwrap_or(TofinoSeqState::A2) == TofinoSeqState::A0 + && !self.resequenced { if let Err(e) = self.monitor_tofino_pcie_link() { ringbuf_entry!(Trace::TofinoSequencerError(e)); diff --git a/drv/sidecar-seq-server/src/tofino.rs b/drv/sidecar-seq-server/src/tofino.rs index 16293c7cee..cb1d906d90 100644 --- a/drv/sidecar-seq-server/src/tofino.rs +++ b/drv/sidecar-seq-server/src/tofino.rs @@ -82,7 +82,7 @@ impl Tofino { /// Poll FPGA for status of the PCIe reset signal from the host. Note that the /// logic level of the reset signal has been normalized in the FPGA, so asserted /// (logic low on PERST_L) will be true and deasserted (logic high) will be false). - pub fn poll_pcie_reset(&mut self) -> Result { + pub fn poll_pcie_reset(&mut self) -> Result<(), SeqError> { let reset_asserted = self .sequencer .is_pcie_reset() @@ -91,7 +91,7 @@ impl Tofino { ringbuf_entry!(Trace::TofinoPcieReset(reset_asserted)); } self.pcie_reset_asserted = reset_asserted; - Ok(self.pcie_reset_asserted) + Ok(()) } pub fn power_up(&mut self) -> Result<(), SeqError> { From 1600641cebb8b5da1974a8fa1e0e7aefd3e56f84 Mon Sep 17 00:00:00 2001 From: Aaron-Hartwig Date: Fri, 6 Mar 2026 10:23:53 -0600 Subject: [PATCH 3/4] Add comments about resequencing once --- drv/sidecar-seq-server/src/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drv/sidecar-seq-server/src/main.rs b/drv/sidecar-seq-server/src/main.rs index c7a4f02129..db3885dffc 100644 --- a/drv/sidecar-seq-server/src/main.rs +++ b/drv/sidecar-seq-server/src/main.rs @@ -409,6 +409,7 @@ impl ServerImpl { ringbuf_entry!(Trace::TofinoResequence); self.tofino.power_down()?; self.tofino.power_up()?; + // The current intention is to only do this once. self.resequenced = true; } } else { @@ -869,6 +870,8 @@ impl NotificationHandler for ServerImpl { if let Err(e) = self.tofino.poll_pcie_reset() { ringbuf_entry!(Trace::TofinoSequencerError(e)); } + // Only monitor the PCIe link if we expect one to be there (i.e., we are in A0). + // Currently, we will only resequence a single time to resolve the problem. if self.tofino.sequencer.state().unwrap_or(TofinoSeqState::A2) == TofinoSeqState::A0 && !self.resequenced From 3d0f00417d1d9d5f7174017f8d2e7f558e77b0d9 Mon Sep 17 00:00:00 2001 From: Aaron-Hartwig Date: Fri, 6 Mar 2026 10:44:55 -0600 Subject: [PATCH 4/4] Adjust PCIe timeout from 30s->120s --- drv/sidecar-seq-server/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drv/sidecar-seq-server/src/main.rs b/drv/sidecar-seq-server/src/main.rs index db3885dffc..90626ba412 100644 --- a/drv/sidecar-seq-server/src/main.rs +++ b/drv/sidecar-seq-server/src/main.rs @@ -122,7 +122,7 @@ enum Trace { ringbuf!(Trace, 32, Trace::None); const TIMER_INTERVAL: u64 = 1000; -const NO_PCIE_LIMIT: u8 = 30; +const NO_PCIE_LIMIT: u8 = 120; // QSFP_2_SP_A2_PG const POWER_GOOD: sys_api::PinSet = sys_api::Port::F.pin(12);