From 34fe26dcdd963c01dbca532df488f96a388b45e0 Mon Sep 17 00:00:00 2001 From: Vinod Chitrali Date: Thu, 21 May 2026 22:29:20 +0000 Subject: [PATCH] feat(api): poll RMS power state for PowerShelf in Ready While a PowerShelf idles in `Ready`, perform a best-effort `GetPowerStateByDeviceList` RPC against RMS and persist the observed `pstate` to `power_shelves.status`. The lookup mirrors the `NodeSet`-with-inline-BMC-endpoint shape used by the `Maintenance` handler's `SetPowerStateByDeviceList` path, so `build_power_shelf_node_info` is bumped to `pub(super)` for reuse. Missing prerequisites (no RMS client, no rack association, no BMC details, no credentials) and transport / status failures are logged but never transition the controller out of `Ready`, so a transient RMS outage cannot bounce the shelf into `Error`. Also bumps `librms` from `v0.0.12-rc1` to `v0.0.12-rc4` (adds the `prost-types` dependency and the new `GetPowerStateByDeviceList`, firmware-object management, and `SetScaleUpFabricState` RPCs), and updates the in-tree mock `RmsApi` implementations to cover the new trait methods. 
Signed-off-by: Vinod Chitrali --- Cargo.lock | 5 +- Cargo.toml | 2 +- crates/api-test-helper/src/mock_rms.rs | 82 +++++++ crates/api/src/rack/rms_client.rs | 72 ++++++ .../power_shelf/maintenance.rs | 11 +- .../src/state_controller/power_shelf/ready.rs | 220 +++++++++++++++++- 6 files changed, 377 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4736395219..0269b1469c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6211,7 +6211,7 @@ dependencies = [ [[package]] name = "librms" version = "0.0.12" -source = "git+https://github.com/NVIDIA/nv-rms-client.git?tag=v0.0.12-rc1#94a7d944e80075cd57d04bdf0ebf2db497e205db" +source = "git+https://github.com/NVIDIA/nv-rms-client.git?tag=v0.0.12-rc4#a870c06ee1483a46ef3fb01cc4ce7d73470c7d3c" dependencies = [ "async-trait", "chrono", @@ -6221,6 +6221,7 @@ dependencies = [ "hyper-timeout", "hyper-util", "prost", + "prost-types", "rustls", "rustls-pemfile", "serde", @@ -11238,7 +11239,7 @@ dependencies = [ [[package]] name = "tonic-client-wrapper" version = "1.0.0" -source = "git+https://github.com/NVIDIA/nv-rms-client.git?tag=v0.0.12-rc1#94a7d944e80075cd57d04bdf0ebf2db497e205db" +source = "git+https://github.com/NVIDIA/nv-rms-client.git?tag=v0.0.12-rc4#a870c06ee1483a46ef3fb01cc4ce7d73470c7d3c" dependencies = [ "async-trait", "heck", diff --git a/Cargo.toml b/Cargo.toml index cd38a73119..46d02072b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ authors = ["NVIDIA Carbide Engineering "] [workspace.dependencies] clap = { version = "4", features = ["derive", "env"] } libredfish = { git = "https://github.com/NVIDIA/libredfish.git", tag = "v0.44.2" } -librms = { git = "https://github.com/NVIDIA/nv-rms-client.git", tag = "v0.0.12-rc1" } +librms = { git = "https://github.com/NVIDIA/nv-rms-client.git", tag = "v0.0.12-rc4" } ansi-to-html = "0.2.2" tokio = { version = "1", features = ["full", "tracing"] } diff --git a/crates/api-test-helper/src/mock_rms.rs b/crates/api-test-helper/src/mock_rms.rs index 
65f0d71fee..277820996e 100644 --- a/crates/api-test-helper/src/mock_rms.rs +++ b/crates/api-test-helper/src/mock_rms.rs @@ -63,6 +63,10 @@ pub struct MockRmsApi { Mutex>>, set_power_state_by_device_list_calls: Mutex>, + get_power_state_by_device_list_responses: + Mutex>>, + get_power_state_by_device_list_calls: Mutex>, + get_power_state_responses: Mutex>>, get_power_state_calls: Mutex>, @@ -201,6 +205,8 @@ impl MockRmsApi { set_power_state_calls: Default::default(), set_power_state_by_device_list_responses: Default::default(), set_power_state_by_device_list_calls: Default::default(), + get_power_state_by_device_list_responses: Default::default(), + get_power_state_by_device_list_calls: Default::default(), get_power_state_responses: Default::default(), get_power_state_calls: Default::default(), sequence_rack_power_responses: Default::default(), @@ -673,12 +679,88 @@ impl RmsApi for MockRmsApi { .push(cmd); pop_or_err(&mut self.set_power_state_by_device_list_responses.lock().await) } + async fn get_power_state_by_device_list( + &self, + cmd: rms::GetPowerStateByDeviceListRequest, + ) -> Result { + self.get_power_state_by_device_list_calls + .lock() + .await + .push(cmd); + pop_or_err(&mut self.get_power_state_by_device_list_responses.lock().await) + } async fn update_switch_system_password( &self, _cmd: rms::UpdateSwitchSystemPasswordRequest, ) -> Result { Ok(rms::UpdateSwitchSystemPasswordResponse::default()) } + async fn add_firmware_object( + &self, + _cmd: rms::AddFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn get_firmware_object( + &self, + _cmd: rms::GetFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn list_firmware_objects( + &self, + _cmd: rms::ListFirmwareObjectsRequest, + ) -> Result { + Ok(rms::ListFirmwareObjectsResponse::default()) + } + async fn delete_firmware_object( + &self, + _cmd: rms::DeleteFirmwareObjectRequest, + ) -> Result { + 
Ok(rms::OperationResponse::default()) + } + async fn set_default_firmware_object( + &self, + _cmd: rms::SetDefaultFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn apply_firmware_object( + &self, + _cmd: rms::ApplyFirmwareObjectRequest, + ) -> Result { + Ok(rms::ApplyFirmwareObjectResponse::default()) + } + async fn apply_firmware_object_from_json( + &self, + _cmd: rms::ApplyFirmwareObjectFromJsonRequest, + ) -> Result { + Ok(rms::ApplyFirmwareObjectResponse::default()) + } + async fn apply_switch_system_image( + &self, + _cmd: rms::ApplySwitchSystemImageRequest, + ) -> Result { + Ok(rms::ApplySwitchSystemImageResponse::default()) + } + async fn apply_switch_system_image_from_json( + &self, + _cmd: rms::ApplySwitchSystemImageFromJsonRequest, + ) -> Result { + Ok(rms::ApplySwitchSystemImageResponse::default()) + } + async fn get_firmware_object_history( + &self, + _cmd: rms::GetFirmwareObjectHistoryRequest, + ) -> Result { + Ok(rms::GetFirmwareObjectHistoryResponse::default()) + } + async fn set_scale_up_fabric_state( + &self, + _cmd: rms::SetScaleUpFabricStateRequest, + ) -> Result { + Ok(rms::SetScaleUpFabricStateResponse::default()) + } async fn get_power_state( &self, cmd: rms::GetPowerStateRequest, diff --git a/crates/api/src/rack/rms_client.rs b/crates/api/src/rack/rms_client.rs index dbc82c81d7..24c0176160 100644 --- a/crates/api/src/rack/rms_client.rs +++ b/crates/api/src/rack/rms_client.rs @@ -340,6 +340,78 @@ pub mod test_support { ) -> Result { Ok(rms::GetPowerStateResponse::default()) } + async fn get_power_state_by_device_list( + &self, + _cmd: rms::GetPowerStateByDeviceListRequest, + ) -> Result { + Ok(rms::GetPowerStateByDeviceListResponse::default()) + } + async fn add_firmware_object( + &self, + _cmd: rms::AddFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn get_firmware_object( + &self, + _cmd: rms::GetFirmwareObjectRequest, + ) -> Result { + 
Ok(rms::FirmwareObject::default()) + } + async fn list_firmware_objects( + &self, + _cmd: rms::ListFirmwareObjectsRequest, + ) -> Result { + Ok(rms::ListFirmwareObjectsResponse::default()) + } + async fn delete_firmware_object( + &self, + _cmd: rms::DeleteFirmwareObjectRequest, + ) -> Result { + Ok(rms::OperationResponse::default()) + } + async fn set_default_firmware_object( + &self, + _cmd: rms::SetDefaultFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn apply_firmware_object( + &self, + _cmd: rms::ApplyFirmwareObjectRequest, + ) -> Result { + Ok(rms::ApplyFirmwareObjectResponse::default()) + } + async fn apply_firmware_object_from_json( + &self, + _cmd: rms::ApplyFirmwareObjectFromJsonRequest, + ) -> Result { + Ok(rms::ApplyFirmwareObjectResponse::default()) + } + async fn apply_switch_system_image( + &self, + _cmd: rms::ApplySwitchSystemImageRequest, + ) -> Result { + Ok(rms::ApplySwitchSystemImageResponse::default()) + } + async fn apply_switch_system_image_from_json( + &self, + _cmd: rms::ApplySwitchSystemImageFromJsonRequest, + ) -> Result { + Ok(rms::ApplySwitchSystemImageResponse::default()) + } + async fn get_firmware_object_history( + &self, + _cmd: rms::GetFirmwareObjectHistoryRequest, + ) -> Result { + Ok(rms::GetFirmwareObjectHistoryResponse::default()) + } + async fn set_scale_up_fabric_state( + &self, + _cmd: rms::SetScaleUpFabricStateRequest, + ) -> Result { + Ok(rms::SetScaleUpFabricStateResponse::default()) + } async fn sequence_rack_power( &self, _cmd: rms::SequenceRackPowerRequest, diff --git a/crates/api/src/state_controller/power_shelf/maintenance.rs b/crates/api/src/state_controller/power_shelf/maintenance.rs index c13bfc2567..6fb5217b4a 100644 --- a/crates/api/src/state_controller/power_shelf/maintenance.rs +++ b/crates/api/src/state_controller/power_shelf/maintenance.rs @@ -245,11 +245,12 @@ async fn invoke_rms_power_operation( } /// Build the `rms::NewNodeInfo` describing this power shelf for 
inclusion -/// in a `SetPowerStateByDeviceList` request. Resolves the BMC IP from the -/// database and BMC credentials via the credential manager, since the -/// caller-supplied variant of the RPC requires the BMC connection details -/// inline rather than relying on RMS's inventory. -async fn build_power_shelf_node_info( +/// in any caller-supplied `NodeSet` request (`SetPowerStateByDeviceList` +/// from `Maintenance`, `GetPowerStateByDeviceList` from `Ready`). Resolves +/// the BMC IP from the database and BMC credentials via the credential +/// manager, since these RPCs require the BMC connection details inline +/// rather than relying on RMS's inventory. +pub(super) async fn build_power_shelf_node_info( power_shelf_id: &PowerShelfId, state: &PowerShelf, rack_id: String, diff --git a/crates/api/src/state_controller/power_shelf/ready.rs b/crates/api/src/state_controller/power_shelf/ready.rs index 673735a1d8..36ff01c941 100644 --- a/crates/api/src/state_controller/power_shelf/ready.rs +++ b/crates/api/src/state_controller/power_shelf/ready.rs @@ -18,9 +18,14 @@ //! Handler for PowerShelfControllerState::Ready. use carbide_uuid::power_shelf::PowerShelfId; -use model::power_shelf::{PowerShelf, PowerShelfControllerState}; +use db::power_shelf as db_power_shelf; +use librms::protos::rack_manager as rms; +use model::power_shelf::{PowerShelf, PowerShelfControllerState, PowerShelfStatus}; +use sqlx::PgTransaction; +use crate::state_controller::external_service_error::rack_manager_error; use crate::state_controller::power_shelf::context::PowerShelfStateHandlerContextObjects; +use crate::state_controller::power_shelf::maintenance::build_power_shelf_node_info; use crate::state_controller::state_handler::{ StateHandlerContext, StateHandlerError, StateHandlerOutcome, }; @@ -30,14 +35,15 @@ use crate::state_controller::state_handler::{ /// If the power shelf is marked for deletion, transitions to `Deleting`. 
/// If a maintenance request has been posted via /// `power_shelf_maintenance_requested`, transitions to `Maintenance` with the -/// requested operation (PowerOn / PowerOff). Otherwise idles. +/// requested operation (PowerOn / PowerOff). Otherwise polls RMS for the +/// current power state (best-effort observation) and idles. /// /// TODO: Implement PowerShelf monitoring (health checks, status updates, /// power consumption / efficiency tracking). pub async fn handle_ready( power_shelf_id: &PowerShelfId, state: &mut PowerShelf, - _ctx: &mut StateHandlerContext<'_, PowerShelfStateHandlerContextObjects>, + ctx: &mut StateHandlerContext<'_, PowerShelfStateHandlerContextObjects>, ) -> Result, StateHandlerError> { if state.is_marked_as_deleted() { return Ok(StateHandlerOutcome::transition( @@ -58,9 +64,209 @@ pub async fn handle_ready( )); } + let txn = poll_rms_power_state(power_shelf_id, state, ctx).await; + tracing::info!("PowerShelf {} is ready", power_shelf_id,); - Ok(StateHandlerOutcome::wait(format!( - "PowerShelf {} is ready", - power_shelf_id - ))) + Ok( + StateHandlerOutcome::wait(format!("PowerShelf {} is ready", power_shelf_id)) + .with_txn_opt(txn), + ) +} + +/// Best-effort RMS `GetPowerStateByDeviceList` poll while the power shelf +/// idles in `Ready`. +/// +/// Mirrors the `NodeSet`-with-inline-BMC-endpoint shape used by the +/// `Maintenance` handler's `SetPowerStateByDeviceList` path: RMS's +/// `*ByDeviceList` RPCs take caller-supplied BMC connection details and do +/// not consult RMS's inventory. The lookup is purely observational, so +/// missing prerequisites (no RMS client, no rack association, no BMC +/// MAC/IP, no credentials) and transport / status failures are logged but +/// never cause a transition out of `Ready` — a transient RMS outage must +/// not bounce the controller into `Error`. 
+/// +/// On a successful response, the observed `pstate` for this power shelf is +/// persisted to the `power_shelves.status` column and the in-memory `state` +/// is updated to match. The returned `PgTransaction` (if any) carries that +/// status write so the caller can attach it to the `Ready` outcome and have +/// the state-controller framework commit it alongside the usual outcome +/// bookkeeping. +async fn poll_rms_power_state( + power_shelf_id: &PowerShelfId, + state: &mut PowerShelf, + ctx: &mut StateHandlerContext<'_, PowerShelfStateHandlerContextObjects>, +) -> Option> { + let Some(rms_client) = ctx.services.rms_client.as_ref() else { + tracing::debug!( + power_shelf_id = %power_shelf_id, + "PowerShelf Ready: skipping RMS GetPowerStateByDeviceList; RMS client not configured", + ); + return None; + }; + + let Some(rack_id) = state.rack_id.as_ref() else { + tracing::debug!( + power_shelf_id = %power_shelf_id, + "PowerShelf Ready: skipping RMS GetPowerStateByDeviceList; power shelf has no rack association", + ); + return None; + }; + + let device = match build_power_shelf_node_info( + power_shelf_id, + state, + rack_id.to_string(), + &ctx.services.db_pool, + ctx.services.credential_manager.as_ref(), + ) + .await + { + Ok(device) => device, + Err(cause) => { + tracing::debug!( + power_shelf_id = %power_shelf_id, + rack_id = %rack_id, + cause = %cause, + "PowerShelf Ready: skipping RMS GetPowerStateByDeviceList; unable to build NodeSet", + ); + return None; + } + }; + + let request = rms::GetPowerStateByDeviceListRequest { + nodes: Some(rms::NodeSet { + devices: vec![device], + }), + ..Default::default() + }; + + let rack_id_str = rack_id.to_string(); + let response = match rms_client.get_power_state_by_device_list(request).await { + Ok(response) => response, + Err(error) => { + let error = rack_manager_error("get_power_state_by_device_list", error); + tracing::warn!( + power_shelf_id = %power_shelf_id, + rack_id = %rack_id_str, + error = %error, + "RMS 
GetPowerStateByDeviceList transport error", + ); + return None; + } + }; + + let batch = response.response.clone().unwrap_or_default(); + if !(batch.status == rms::ReturnCode::Success as i32 && batch.failed_nodes == 0) { + tracing::warn!( + power_shelf_id = %power_shelf_id, + rack_id = %rack_id_str, + batch_status = batch.status, + successful_nodes = batch.successful_nodes, + failed_nodes = batch.failed_nodes, + message = %batch.message, + "RMS GetPowerStateByDeviceList returned non-Success result", + ); + return None; + } + + tracing::info!( + power_shelf_id = %power_shelf_id, + rack_id = %rack_id_str, + successful_nodes = batch.successful_nodes, + pstates = ?response + .node_power_states + .iter() + .map(|node| (node.node_id.as_str(), node.pstate.as_str())) + .collect::>(), + "RMS GetPowerStateByDeviceList succeeded", + ); + + persist_observed_power_state(power_shelf_id, state, ctx, &response.node_power_states).await +} + +/// Look up the `NodePowerState` for this power shelf in the RMS response, +/// stamp the value into `state.status`, and persist it via +/// `db_power_shelf::update`. Returns the open `PgTransaction` so the caller +/// can attach it to the `Ready` outcome. +/// +/// Status persistence is best-effort: if RMS did not echo a result for this +/// node, the power state is unchanged, or the DB write fails, the in-memory +/// state is left as it was and `None` is returned — `Ready` stays in `Ready`. 
+async fn persist_observed_power_state( + power_shelf_id: &PowerShelfId, + state: &mut PowerShelf, + ctx: &mut StateHandlerContext<'_, PowerShelfStateHandlerContextObjects>, + node_power_states: &[rms::NodePowerState], +) -> Option> { + let node_id = power_shelf_id.to_string(); + let Some(observed) = node_power_states + .iter() + .find(|node| node.node_id == node_id) + else { + tracing::debug!( + power_shelf_id = %power_shelf_id, + "RMS GetPowerStateByDeviceList: no NodePowerState echoed for this power shelf; skipping status update", + ); + return None; + }; + + let new_power_state = observed.pstate.to_lowercase(); + let new_status = match state.status.as_ref() { + Some(existing) => PowerShelfStatus { + shelf_name: existing.shelf_name.clone(), + power_state: new_power_state.clone(), + health_status: existing.health_status.clone(), + }, + None => PowerShelfStatus { + shelf_name: state.config.name.clone(), + power_state: new_power_state.clone(), + health_status: String::new(), + }, + }; + + if state + .status + .as_ref() + .is_some_and(|s| s.power_state == new_status.power_state) + { + tracing::debug!( + power_shelf_id = %power_shelf_id, + power_state = %new_status.power_state, + "PowerShelf status power_state unchanged; skipping DB write", + ); + return None; + } + + let previous_status = state.status.replace(new_status); + + let mut txn = match ctx.services.db_pool.begin().await { + Ok(txn) => txn, + Err(error) => { + state.status = previous_status; + tracing::warn!( + power_shelf_id = %power_shelf_id, + error = %error, + "PowerShelf Ready: failed to begin txn while persisting observed power state", + ); + return None; + } + }; + + if let Err(error) = db_power_shelf::update(state, &mut txn).await { + state.status = previous_status; + tracing::warn!( + power_shelf_id = %power_shelf_id, + error = %error, + "PowerShelf Ready: failed to persist observed power state to DB", + ); + return None; + } + + tracing::info!( + power_shelf_id = %power_shelf_id, + power_state = 
%new_power_state, + "PowerShelf Ready: persisted observed power state from RMS", + ); + + Some(txn) }