Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions drv/sidecar-mainboard-controller/src/tofino2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,11 @@ impl Sequencer {
/// Read the raw contents of the FPGA's `PCIE_HOTPLUG_STATUS` register.
///
/// Returns the register value as a `u8`; any `FpgaError` produced by the
/// underlying FPGA read is passed back to the caller unchanged.
pub fn pcie_hotplug_status(&self) -> Result<u8, FpgaError> {
self.fpga.read(Addr::PCIE_HOTPLUG_STATUS)
}

/// Report whether the `HOST_RESET` bit is set in the PCIe hotplug status
/// register.
///
/// Returns `Ok(true)` when the bit is set and `Ok(false)` when it is clear.
/// A failure to read the status register is returned as the `FpgaError`.
pub fn is_pcie_reset(&self) -> Result<bool, FpgaError> {
    self.pcie_hotplug_status().map(|hotplug_status| {
        let host_reset_bit = hotplug_status & Reg::PCIE_HOTPLUG_STATUS::HOST_RESET;
        host_reset_bit != 0
    })
}
}

bitfield! {
Expand Down
58 changes: 58 additions & 0 deletions drv/sidecar-seq-server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ enum Trace {
TofinoCfgRegisterValue(TofinoCfgRegisters, u32),
TofinoPowerUp,
TofinoPowerDown,
TofinoResequence,
TofinoPcieReset(bool),
SetVddCoreVout(userlib::units::Volts),
SetPCIePresent,
ClearPCIePresent,
Expand Down Expand Up @@ -120,6 +122,7 @@ enum Trace {
ringbuf!(Trace, 32, Trace::None);

// NOTE(review): presumably the period of the server's recurring timer
// notification; the units (likely milliseconds) are not visible from here --
// TODO confirm.
const TIMER_INTERVAL: u64 = 1000;
// Number of consecutive passes through `monitor_tofino_pcie_link` in which the
// host has released PCIe reset but the Tofino link is still down before the
// Tofino is resequenced.
const NO_PCIE_LIMIT: u8 = 120;

// QSFP_2_SP_A2_PG
const POWER_GOOD: sys_api::PinSet = sys_api::Port::F.pin(12);
Expand Down Expand Up @@ -169,6 +172,12 @@ struct ServerImpl {
// a piece of state to allow blinking LEDs to be in phase
led_blink_on: bool,
sys: sys_api::Sys,
// used to track how many notification loops elapsed while in A0 without a
// PCIe link. This will be capped at NO_PCIE_LIMIT loops and should not
// overflow.
no_pcie_count: u8,
// keep track if we've resequenced since we only want to try it once
resequenced: bool,
}

impl ServerImpl {
Expand Down Expand Up @@ -376,6 +385,38 @@ impl ServerImpl {
Err(e) => Err(e),
}
}

/// Monitor the status of Tofino's PCIe link to the host.
///
/// When a host is connected and has booted, PERST will be deasserted and the
/// link will train. This worked reliably on Gimlet, but for reasons we've not
/// yet been able to identify it can fail on Cosmo (host thinks things are
/// fine, Tofino does not). When this happens, resequencing the Tofino
/// reliably establishes the link. So we monitor whether we think there should
/// be a PCIe link, and if there should be one but the Tofino reports it down
/// for `NO_PCIE_LIMIT` consecutive calls, we resequence the Tofino.
///
/// # Errors
///
/// Returns `SeqError::FpgaError` if the reset status cannot be read, and
/// propagates any error from `pcie_link_up`, `power_down` or `power_up`.
/// If the power cycle fails, `resequenced` is left `false` so a later call
/// can try again.
fn monitor_tofino_pcie_link(&mut self) -> Result<(), SeqError> {
    let host_in_reset = self
        .tofino
        .sequencer
        .is_pcie_reset()
        .map_err(|_| SeqError::FpgaError)?;

    // The link state is only queried once the host has released reset. If
    // the host is holding reset, or the link is already up, there is nothing
    // to recover from and the "missing link" streak starts over.
    if host_in_reset || self.tofino.pcie_link_up()? {
        self.no_pcie_count = 0;
        return Ok(());
    }

    // Saturate rather than `+= 1`: if the power cycle below keeps failing,
    // `resequenced` is never set and this counter keeps being bumped on every
    // call, which would otherwise overflow the `u8`.
    self.no_pcie_count = self.no_pcie_count.saturating_add(1);

    if self.no_pcie_count >= NO_PCIE_LIMIT {
        // We have failed to establish a link, resequence.
        ringbuf_entry!(Trace::TofinoResequence);
        self.tofino.power_down()?;
        self.tofino.power_up()?;
        // The current intention is to only do this once.
        self.resequenced = true;
    }

    Ok(())
}
}

impl idl::InOrderSequencerImpl for ServerImpl {
Expand Down Expand Up @@ -825,6 +866,21 @@ impl NotificationHandler for ServerImpl {
// Fan module monitoring pulled out to keep this loop readable
self.monitor_fan_modules();

// Monitor Tofino PCIe Link
if let Err(e) = self.tofino.poll_pcie_reset() {
ringbuf_entry!(Trace::TofinoSequencerError(e));
}
// Only monitor the PCIe link if we expect one to be there (i.e., we are in A0).
// Currently, we will only resequence a single time to resolve the problem.
if self.tofino.sequencer.state().unwrap_or(TofinoSeqState::A2)
== TofinoSeqState::A0
&& !self.resequenced
{
if let Err(e) = self.monitor_tofino_pcie_link() {
ringbuf_entry!(Trace::TofinoSequencerError(e));
}
}

let finish = sys_get_timer().now;

// We now know when we were notified and when any work was completed.
Expand Down Expand Up @@ -911,6 +967,8 @@ fn main() -> ! {
fan_modules,
led_blink_on: false,
sys,
no_pcie_count: 0,
resequenced: false,
};

ringbuf_entry!(Trace::FpgaInit);
Expand Down
17 changes: 17 additions & 0 deletions drv/sidecar-seq-server/src/tofino.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ pub(crate) struct Tofino {
pub abort_reported: bool,
pub ready_for_power_up: bool,
pub pcie_link_up: bool,
pub pcie_reset_asserted: bool,
}

impl Tofino {
Expand All @@ -27,6 +28,7 @@ impl Tofino {
abort_reported: false,
ready_for_power_up: false,
pcie_link_up: false,
pcie_reset_asserted: true,
}
}

Expand Down Expand Up @@ -77,6 +79,21 @@ impl Tofino {
== 0xf)
}

/// Sample the host's PCIe reset signal from the FPGA and cache the result in
/// `pcie_reset_asserted`, recording a `Trace::TofinoPcieReset` ring buffer
/// entry whenever the value differs from the previously cached one.
///
/// Note that the logic level of the reset signal has been normalized in the
/// FPGA: asserted (logic low on PERST_L) reads as `true` and deasserted
/// (logic high) reads as `false`.
///
/// Returns `SeqError::FpgaError` if the FPGA read fails; the cached value is
/// left untouched in that case.
pub fn poll_pcie_reset(&mut self) -> Result<(), SeqError> {
    let asserted_now = match self.sequencer.is_pcie_reset() {
        Ok(asserted) => asserted,
        Err(_) => return Err(SeqError::FpgaError),
    };

    // Only log on an edge so the ring buffer is not flooded with identical
    // entries on every poll.
    let changed = asserted_now != self.pcie_reset_asserted;
    if changed {
        ringbuf_entry!(Trace::TofinoPcieReset(asserted_now));
    }

    self.pcie_reset_asserted = asserted_now;
    Ok(())
}

pub fn power_up(&mut self) -> Result<(), SeqError> {
ringbuf_entry!(Trace::TofinoPowerUp);

Expand Down