diff --git a/crates/admin-cli/src/cfg/cli_options.rs b/crates/admin-cli/src/cfg/cli_options.rs index 178371b273..ca2fabcd6b 100644 --- a/crates/admin-cli/src/cfg/cli_options.rs +++ b/crates/admin-cli/src/cfg/cli_options.rs @@ -23,7 +23,7 @@ use crate::{ expected_rack, expected_switch, extension_service, firmware, generate_shell_complete, host, ib_partition, instance, instance_type, inventory, ip, ipxe_template, jump, machine, machine_interfaces, machine_validation, managed_host, managed_switch, mlx, network_devices, - network_security_group, network_segment, nvl_logical_partition, nvl_partition, + network_security_group, network_segment, nvl_domain, nvl_logical_partition, nvl_partition, nvlink_nmxc_endpoints, operating_system, os_image, ping, power_shelf, rack, redfish, resource_pool, rms, route_server, scout_stream, set, site_explorer, sku, ssh, switch, tenant, tenant_keyset, tpm_ca, trim_table, version, vpc, vpc_peering, vpc_prefix, @@ -337,6 +337,13 @@ pub enum CliCommand { )] NvlPartition(nvl_partition::Cmd), + #[clap( + about = "NVLink domain related handling", + subcommand, + visible_alias = "nvd" + )] + NvlDomain(nvl_domain::Cmd), + #[clap( about = "Logical partition related handling", subcommand, @@ -347,9 +354,10 @@ pub enum CliCommand { #[clap(subcommand)] #[clap(verbatim_doc_comment)] /// DPF-related commands. - /// Note: These commands update the DPF state of the machine, which determines DPF-based DPU re-provisioning. - /// The state is saved in the machine's metadata and will be deleted if the machine is force-deleted. - /// To make the state persistent, add the DPF state for a machine (host) to the expected machines table. + /// Note: These commands update the DPF state of the machine, which determines DPF-based DPU + /// re-provisioning. The state is saved in the machine's metadata and will be deleted if the + /// machine is force-deleted. To make the state persistent, add the DPF state for a machine + /// (host) to the expected machines table. 
Dpf(crate::dpf::Cmd), #[clap(about = "Tenant management", subcommand, visible_alias = "tm")] diff --git a/crates/admin-cli/src/main.rs b/crates/admin-cli/src/main.rs index c035143c4b..fd37bca562 100644 --- a/crates/admin-cli/src/main.rs +++ b/crates/admin-cli/src/main.rs @@ -85,6 +85,7 @@ mod mlx; mod network_devices; mod network_security_group; mod network_segment; +mod nvl_domain; mod nvl_logical_partition; mod nvl_partition; mod nvlink_nmxc_endpoints; @@ -248,6 +249,7 @@ async fn main() -> color_eyre::Result<()> { CliCommand::NetworkSecurityGroup(cmd) => cmd.dispatch(ctx).await?, CliCommand::NetworkSegment(cmd) => cmd.dispatch(ctx).await?, CliCommand::NvlinkNmxcEndpoints(cmd) => cmd.dispatch(ctx).await?, + CliCommand::NvlDomain(cmd) => cmd.dispatch(ctx).await?, CliCommand::NvlPartition(cmd) => cmd.dispatch(ctx).await?, CliCommand::IpxeTemplate(cmd) => cmd.dispatch(ctx).await?, CliCommand::OsImage(cmd) => cmd.dispatch(ctx).await?, diff --git a/crates/admin-cli/src/nvl_domain/health_report/args.rs b/crates/admin-cli/src/nvl_domain/health_report/args.rs new file mode 100644 index 0000000000..02b0ba9698 --- /dev/null +++ b/crates/admin-cli/src/nvl_domain/health_report/args.rs @@ -0,0 +1,32 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use carbide_uuid::nvlink::NvLinkDomainId; +use clap::Parser; + +#[derive(Parser, Debug)] +pub enum Args { + #[clap(about = "List health report sources for an NVLink domain")] + Show { domain_id: NvLinkDomainId }, + #[clap(about = "Print an empty health report template")] + PrintEmptyTemplate, + #[clap(about = "Remove a health report source from an NVLink domain")] + Remove { + domain_id: NvLinkDomainId, + report_source: String, + }, +} diff --git a/crates/admin-cli/src/nvl_domain/health_report/cmd.rs b/crates/admin-cli/src/nvl_domain/health_report/cmd.rs new file mode 100644 index 0000000000..9505d4f06f --- /dev/null +++ b/crates/admin-cli/src/nvl_domain/health_report/cmd.rs @@ -0,0 +1,206 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use ::rpc::admin_cli::OutputFormat; +use ::rpc::forge::{ + self as forgerpc, ListNvLinkDomainHealthReportsRequest, RemoveNvLinkDomainHealthReportRequest, +}; +use ::rpc::health::HealthReport; +use chrono::{DateTime, SecondsFormat, Utc}; +use prettytable::{Table, row}; + +use super::args::Args; +use crate::errors::{CarbideCliError, CarbideCliResult}; +use crate::health_utils; +use crate::rpc::ApiClient; + +const MESSAGE_WRAP_WIDTH: usize = 60; +const TARGET_WRAP_WIDTH: usize = 36; + +/// Handles NVLink domain health-report CLI subcommands. 
+///
+/// `Show` lists the report sources stored for a domain, `Remove` deletes one source, and
+/// `PrintEmptyTemplate` prints a blank report template without contacting the API.
+pub async fn handle_health_report(
+    command: Args,
+    output_format: OutputFormat,
+    api_client: &ApiClient,
+) -> CarbideCliResult<()> {
+    match command {
+        Args::Show { domain_id } => {
+            let response = api_client
+                .0
+                .list_nv_link_domain_health_reports(ListNvLinkDomainHealthReportsRequest {
+                    domain_id: Some(domain_id),
+                })
+                .await?;
+
+            display_health_reports(response.health_report_entries, output_format)?;
+        }
+        Args::Remove {
+            domain_id,
+            report_source,
+        } => {
+            api_client
+                .0
+                .remove_nv_link_domain_health_report(RemoveNvLinkDomainHealthReportRequest {
+                    domain_id: Some(domain_id),
+                    source: report_source,
+                })
+                .await?;
+        }
+        Args::PrintEmptyTemplate => {
+            health_utils::print_empty_template();
+        }
+    }
+
+    Ok(())
+}
+
+/// Displays NVLink domain health reports with NVL-specific table formatting.
+///
+/// JSON output is delegated to the shared `health_utils` renderer. Any other format prints a
+/// summary table followed by one alert table per source that has alerts.
+///
+/// # Errors
+///
+/// Returns `CarbideCliError::GenericError` when an entry has no report or carries an unknown
+/// apply mode. All entries are validated before anything is printed.
+fn display_health_reports(
+    entries: Vec<forgerpc::HealthReportEntry>,
+    output_format: OutputFormat,
+) -> CarbideCliResult<()> {
+    // Preserve the existing JSON contract used by other health-report commands.
+    if output_format == OutputFormat::Json {
+        return health_utils::display_health_reports(entries, output_format);
+    }
+
+    let mut rows = Vec::with_capacity(entries.len());
+    for entry in entries {
+        // Build the error lazily so the success path does not allocate a message.
+        let report = entry
+            .report
+            .ok_or_else(|| CarbideCliError::GenericError("missing response".to_string()))?;
+        let mode = match forgerpc::HealthReportApplyMode::try_from(entry.mode)
+            .map_err(|_| CarbideCliError::GenericError("invalid response".to_string()))?
+        {
+            forgerpc::HealthReportApplyMode::Merge => "Merge",
+            forgerpc::HealthReportApplyMode::Replace => "Replace",
+        };
+
+        rows.push((report, mode));
+    }
+
+    if rows.is_empty() {
+        println!("No health report entries found.");
+        return Ok(());
+    }
+
+    println!("Health report entries: {}", rows.len());
+
+    let mut summary = Table::new();
+    summary.set_titles(row!["Source", "Mode", "Observed At", "Alerts"]);
+    for (report, mode) in &rows {
+        summary.add_row(row![
+            report.source.as_str(),
+            *mode,
+            format_timestamp(report.observed_at),
+            report.alerts.len()
+        ]);
+    }
+
+    summary.printstd();
+
+    for (report, mode) in &rows {
+        print_alerts(report, mode);
+    }
+
+    Ok(())
+}
+
+/// Prints NVLink domain alerts in a compact table layout.
+///
+/// Prints nothing when the report has no alerts.
+fn print_alerts(report: &HealthReport, mode: &str) {
+    if report.alerts.is_empty() {
+        return;
+    }
+
+    println!();
+    println!("Alerts for source {} ({mode})", report.source);
+
+    let mut table = Table::new();
+    table.set_titles(row!["Id", "Target", "Since", "Message", "Classifications"]);
+    for alert in &report.alerts {
+        let message = format_message(&alert.message);
+
+        table.add_row(row![
+            alert.id.as_str(),
+            wrap_text(alert.target.as_deref().unwrap_or("-"), TARGET_WRAP_WIDTH),
+            format_timestamp(alert.in_alert_since),
+            wrap_text(&message, MESSAGE_WRAP_WIDTH),
+            format_list(&alert.classifications)
+        ]);
+    }
+
+    table.printstd();
+}
+
+/// Formats optional protobuf timestamps for table display.
+///
+/// Returns `"-"` when the timestamp is absent or cannot be converted to a UTC `DateTime`.
+fn format_timestamp<T>(timestamp: Option<T>) -> String
+where
+    DateTime<Utc>: TryFrom<T>,
+{
+    timestamp
+        .and_then(|timestamp| DateTime::<Utc>::try_from(timestamp).ok())
+        .map(|timestamp| timestamp.to_rfc3339_opts(SecondsFormat::AutoSi, true))
+        .unwrap_or_else(|| "-".to_string())
+}
+
+/// Formats repeated values as one table line per item.
+///
+/// Returns `"-"` for an empty slice.
+fn format_list(values: &[String]) -> String {
+    if values.is_empty() {
+        "-".to_string()
+    } else {
+        values.join("\n")
+    }
+}
+
+/// Pretty-prints structured alert messages when they are JSON.
+///
+/// Messages that are not a JSON object or array are returned unchanged.
+fn format_message(message: &str) -> String {
+    // Only objects and arrays are re-rendered. Bare scalars are valid JSON too, but
+    // round-tripping them can rewrite a plain-text message (a numeric string such as "1e3"
+    // would come back in a different spelling).
+    match serde_json::from_str::<serde_json::Value>(message) {
+        Ok(value) if value.is_object() || value.is_array() => {
+            serde_json::to_string_pretty(&value).unwrap_or_else(|_| message.to_string())
+        }
+        _ => message.to_string(),
+    }
+}
+
+/// Hard-wraps text for prettytable cells.
+///
+/// The existing MLX wrapper is feature-local and word-based; this output needs
+/// to split long IDs and JSON fragments that may not contain whitespace.
+///
+/// Existing line breaks are kept and each line is wrapped independently.
+fn wrap_text(value: &str, width: usize) -> String {
+    value
+        .lines()
+        .map(|line| wrap_line(line, width))
+        .collect::<Vec<_>>()
+        .join("\n")
+}
+
+/// Hard-wraps a single line without changing its content.
+///
+/// A newline is inserted after every `width` characters (`char`s, not display columns). A
+/// `width` of zero disables wrapping.
+fn wrap_line(value: &str, width: usize) -> String {
+    if width == 0 {
+        return value.to_string();
+    }
+
+    // One extra byte per inserted newline. The byte length is never smaller than the character
+    // count, so this is an upper bound on the output size.
+    let mut output = String::with_capacity(value.len() + value.len() / width);
+    for (index, ch) in value.chars().enumerate() {
+        if index > 0 && index % width == 0 {
+            output.push('\n');
+        }
+
+        output.push(ch);
+    }
+
+    output
+}
diff --git a/crates/admin-cli/src/nvl_domain/health_report/mod.rs b/crates/admin-cli/src/nvl_domain/health_report/mod.rs
new file mode 100644
index 0000000000..e68b43d969
--- /dev/null
+++ b/crates/admin-cli/src/nvl_domain/health_report/mod.rs
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +pub mod args; +pub mod cmd; + +pub use args::Args; + +use crate::cfg::run::Run; +use crate::cfg::runtime::RuntimeContext; +use crate::errors::CarbideCliResult; + +impl Run for Args { + async fn run(self, ctx: &mut RuntimeContext) -> CarbideCliResult<()> { + cmd::handle_health_report(self, ctx.config.format, &ctx.api_client).await + } +} diff --git a/crates/admin-cli/src/nvl_domain/mod.rs b/crates/admin-cli/src/nvl_domain/mod.rs new file mode 100644 index 0000000000..372a40c521 --- /dev/null +++ b/crates/admin-cli/src/nvl_domain/mod.rs @@ -0,0 +1,35 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +pub mod health_report; + +#[cfg(test)] +mod tests; + +use clap::Parser; + +use crate::cfg::dispatch::Dispatch; + +#[derive(Parser, Debug, Dispatch)] +pub enum Cmd { + #[clap( + about = "Manage NVLink domain health report sources", + subcommand, + visible_alias = "hr" + )] + HealthReport(health_report::Args), +} diff --git a/crates/admin-cli/src/nvl_domain/tests.rs b/crates/admin-cli/src/nvl_domain/tests.rs new file mode 100644 index 0000000000..c5da0642e3 --- /dev/null +++ b/crates/admin-cli/src/nvl_domain/tests.rs @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+use clap::{CommandFactory, Parser};
+
+use super::health_report::args::Args as HealthReportCommand;
+use super::*;
+
+const TEST_DOMAIN_ID: &str = "00000000-0000-0000-0000-000000000001";
+
+/// Parses `args` into a [`Cmd`], panicking with clap's message when parsing fails.
+fn parse_cmd(args: impl IntoIterator<Item = &'static str>) -> Cmd {
+    match Cmd::try_parse_from(args) {
+        Ok(cmd) => cmd,
+        Err(err) => panic!("failed to parse command: {err}"),
+    }
+}
+
+#[test]
+fn verify_cmd_structure() {
+    Cmd::command().debug_assert();
+}
+
+#[test]
+fn parse_health_report_show() {
+    let cmd = parse_cmd(["nvl-domain", "health-report", "show", TEST_DOMAIN_ID]);
+
+    let Cmd::HealthReport(command) = cmd;
+    let HealthReportCommand::Show { domain_id } = command else {
+        panic!("expected HealthReport Show variant");
+    };
+
+    assert_eq!(domain_id.to_string(), TEST_DOMAIN_ID);
+}
+
+#[test]
+fn parse_health_report_remove() {
+    let cmd = parse_cmd([
+        "nvl-domain",
+        "health-report",
+        "remove",
+        TEST_DOMAIN_ID,
+        "haas-log-analyzer",
+    ]);
+
+    let Cmd::HealthReport(command) = cmd;
+    let HealthReportCommand::Remove {
+        domain_id,
+        report_source,
+    } = command
+    else {
+        panic!("expected HealthReport Remove variant");
+    };
+
+    assert_eq!(domain_id.to_string(), TEST_DOMAIN_ID);
+    assert_eq!(report_source, "haas-log-analyzer");
+}
diff --git a/crates/api-db/migrations/20260519000000_nvlink_domain_health_reports.sql
b/crates/api-db/migrations/20260519000000_nvlink_domain_health_reports.sql new file mode 100644 index 0000000000..580552bd46 --- /dev/null +++ b/crates/api-db/migrations/20260519000000_nvlink_domain_health_reports.sql @@ -0,0 +1,4 @@ +CREATE TABLE nvlink_domain_health_reports ( + id uuid PRIMARY KEY, + health_reports jsonb NOT NULL DEFAULT '{"merges": {}}'::jsonb +); diff --git a/crates/api-db/src/health_report.rs b/crates/api-db/src/health_report.rs index d2d05b3a5e..f39f2b9875 100644 --- a/crates/api-db/src/health_report.rs +++ b/crates/api-db/src/health_report.rs @@ -36,6 +36,11 @@ where for<'e> Id: sqlx::Encode<'e, sqlx::Postgres> + sqlx::Type + Sync, { let column_name = "health_reports"; + + // NOTE: SQL injection risk is known here: this helper intentionally preserves the existing + // health-report SQL shape where table_name is an internal constant at call sites, but the JSONB + // path still includes the report source in the SQL text. We'll want to replace this with a + // bound text[] path or central source validation before accepting broader inputs. let path = match mode { HealthReportApplyMode::Merge => format!("merges,\"{}\"", health_report.source), HealthReportApplyMode::Replace => "replace".to_string(), diff --git a/crates/api-db/src/lib.rs b/crates/api-db/src/lib.rs index 26a2d5fa68..f9db8acbcf 100644 --- a/crates/api-db/src/lib.rs +++ b/crates/api-db/src/lib.rs @@ -66,6 +66,7 @@ pub mod network_security_group; pub mod network_segment; pub mod nvl_logical_partition; pub mod nvl_partition; +pub mod nvlink_domain_health_report; pub mod nvlink_nmxc_endpoints; pub mod operating_system; pub mod os_image; diff --git a/crates/api-db/src/nvlink_domain_health_report.rs b/crates/api-db/src/nvlink_domain_health_report.rs new file mode 100644 index 0000000000..dc089ca2bc --- /dev/null +++ b/crates/api-db/src/nvlink_domain_health_report.rs @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+use carbide_uuid::nvlink::NvLinkDomainId;
+use health_report::{HealthReport, HealthReportApplyMode};
+use model::health::HealthReportSources;
+use sqlx::PgConnection;
+
+use crate::DatabaseError;
+use crate::db_read::DbReader;
+
+const TABLE_NAME: &str = "nvlink_domain_health_reports";
+
+/// Finds the health report sources stored for an NVLink domain.
+///
+/// Returns `Ok(None)` when the domain has no row in `nvlink_domain_health_reports`.
+pub async fn find(
+    txn: impl DbReader<'_>,
+    domain_id: &NvLinkDomainId,
+) -> Result<Option<HealthReportSources>, DatabaseError> {
+    let query = "SELECT health_reports FROM nvlink_domain_health_reports WHERE id = $1";
+    let health_reports = sqlx::query_scalar::<_, sqlx::types::Json<HealthReportSources>>(query)
+        .bind(domain_id)
+        .fetch_optional(txn)
+        .await
+        .map_err(|e| DatabaseError::new(query, e))?;
+
+    // Unwrap the sqlx `Json` adapter so callers receive the plain model type.
+    Ok(health_reports.map(|json| json.0))
+}
+
+/// Lists NVLink domain IDs that have stored health reports.
+///
+/// IDs are returned in ascending order.
+pub async fn list_domain_ids(
+    txn: impl DbReader<'_>,
+) -> Result<Vec<NvLinkDomainId>, DatabaseError> {
+    let query = "SELECT id FROM nvlink_domain_health_reports ORDER BY id";
+    let ids = sqlx::query_scalar::<_, NvLinkDomainId>(query)
+        .fetch_all(txn)
+        .await
+        .map_err(|e| DatabaseError::new(query, e))?;
+
+    Ok(ids)
+}
+
+/// Inserts or updates one health report source for an NVLink domain.
+pub async fn insert_health_report(
+    txn: &mut PgConnection,
+    domain_id: &NvLinkDomainId,
+    mode: HealthReportApplyMode,
+    health_report: &HealthReport,
+) -> Result<(), DatabaseError> {
+    // Make sure the row exists first, then hand off to the shared helper with this table's name.
+    ensure_row(txn, domain_id).await?;
+
+    crate::health_report::insert_health_report(txn, TABLE_NAME, domain_id, mode, health_report)
+        .await
+}
+
+/// Removes one health report source from an NVLink domain.
+pub async fn remove_health_report(
+    txn: &mut PgConnection,
+    domain_id: &NvLinkDomainId,
+    mode: HealthReportApplyMode,
+    source: &str,
+) -> Result<(), DatabaseError> {
+    // Delegates directly to the shared helper; no row is created on this path.
+    crate::health_report::remove_health_report(txn, TABLE_NAME, domain_id, mode, source).await
+}
+
+/// Creates the domain row before applying a JSON health-report update.
+async fn ensure_row(
+    txn: &mut PgConnection,
+    domain_id: &NvLinkDomainId,
+) -> Result<(), DatabaseError> {
+    // Health reports can arrive before inventory creates an NVLink domain row,
+    // so inserts lazily create the container row and then use the shared JSON
+    // update path.
+    // `ON CONFLICT (id) DO NOTHING` leaves an already existing row untouched.
+    let query =
+        "INSERT INTO nvlink_domain_health_reports (id) VALUES ($1) ON CONFLICT (id) DO NOTHING";
+
+    sqlx::query(query)
+        .bind(domain_id)
+        .execute(txn)
+        .await
+        .map_err(|e| DatabaseError::new(query, e))?;
+
+    Ok(())
+}
diff --git a/crates/api/src/api.rs b/crates/api/src/api.rs
index 7f32b08289..4e4c4b7eb4 100644
--- a/crates/api/src/api.rs
+++ b/crates/api/src/api.rs
@@ -622,6 +622,27 @@ impl Forge for Api {
         crate::handlers::power_shelf::remove_power_shelf_health_report(self, request).await
     }
 
+    async fn list_nv_link_domain_health_reports(
+        &self,
+        request: Request,
+    ) -> Result, Status> {
+        crate::handlers::nvlink_domain::list_nv_link_domain_health_reports(self, request).await
+    }
+
+    async fn insert_nv_link_domain_health_report(
+        &self,
+        request: Request,
+    ) -> Result, Status> {
+        crate::handlers::nvlink_domain::insert_nv_link_domain_health_report(self, request).await
+    }
+
+    async fn remove_nv_link_domain_health_report(
+        &self,
+        request: Request,
+    ) -> Result, Status> {
+        crate::handlers::nvlink_domain::remove_nv_link_domain_health_report(self, request).await
+    }
+
     async fn get_all_domain_metadata(
         &self,
         request: Request,
diff --git a/crates/api/src/auth/internal_rbac_rules.rs b/crates/api/src/auth/internal_rbac_rules.rs
index d68b6e6f0b..488b531bb3 100644
--- a/crates/api/src/auth/internal_rbac_rules.rs
+++ b/crates/api/src/auth/internal_rbac_rules.rs
@@ -166,6 +166,15 @@ impl InternalRBACRules {
         x.perm("ListPowerShelfHealthReports", vec![ForgeAdminCLI, Health]);
         x.perm("InsertPowerShelfHealthReport", vec![ForgeAdminCLI, Health]);
         x.perm("RemovePowerShelfHealthReport", vec![ForgeAdminCLI, Health]);
+        x.perm("ListNVLinkDomainHealthReports", vec![ForgeAdminCLI, Health]);
+        x.perm(
+            "InsertNVLinkDomainHealthReport",
+            vec![ForgeAdminCLI, Health],
+        );
+        x.perm(
+            "RemoveNVLinkDomainHealthReport",
+            vec![ForgeAdminCLI, Health],
+        );
         // Deprecated aliases for the machine health report RPCs.
Mirror the // permissions of their canonical equivalents above. Drop once we're // confident no clients are still calling the old names. diff --git a/crates/api/src/handlers/mod.rs b/crates/api/src/handlers/mod.rs index 2596a50c8d..7b8142811f 100644 --- a/crates/api/src/handlers/mod.rs +++ b/crates/api/src/handlers/mod.rs @@ -62,6 +62,7 @@ pub mod network_security_group; pub mod network_segment; pub mod nmxc_browse; pub mod nvl_partition; +pub mod nvlink_domain; pub mod nvlink_nmxc_endpoints; pub mod operating_system; pub mod power_options; diff --git a/crates/api/src/handlers/nvlink_domain.rs b/crates/api/src/handlers/nvlink_domain.rs new file mode 100644 index 0000000000..030808d849 --- /dev/null +++ b/crates/api/src/handlers/nvlink_domain.rs @@ -0,0 +1,159 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+use ::rpc::forge::{self as rpc, HealthReportEntry};
+use carbide_uuid::nvlink::NvLinkDomainId;
+use health_report::HealthReportApplyMode;
+use model::health::HealthReportSources;
+use tonic::{Request, Response, Status};
+
+use crate::CarbideError;
+use crate::api::{Api, log_request_data};
+use crate::auth::AuthContext;
+
+/// Lists the health report sources stored for an NVLink domain.
+///
+/// A domain without a stored row yields the default (empty) set of sources rather than an error.
+pub async fn list_nv_link_domain_health_reports(
+    api: &Api,
+    request: Request<rpc::ListNvLinkDomainHealthReportsRequest>,
+) -> Result<Response<rpc::ListHealthReportResponse>, Status> {
+    log_request_data(&request);
+
+    let domain_id = request
+        .into_inner()
+        .domain_id
+        .ok_or_else(|| CarbideError::MissingArgument("domain_id"))?;
+
+    let health_reports =
+        db::nvlink_domain_health_report::find(api.db_reader().as_mut(), &domain_id)
+            .await?
+            .unwrap_or_default();
+
+    Ok(Response::new(list_response(health_reports)))
+}
+
+/// Inserts or replaces one health report source for an NVLink domain.
+///
+/// Any report already stored under the same source is removed first, in either mode, so a
+/// source never appears in both the merge set and the replace slot.
+///
+/// # Errors
+///
+/// Fails when `health_report_entry`, `domain_id` or `report` is missing, when `mode` is not a
+/// known apply mode, when the report cannot be converted, or when a database call fails.
+pub async fn insert_nv_link_domain_health_report(
+    api: &Api,
+    request: Request<rpc::InsertNvLinkDomainHealthReportRequest>,
+) -> Result<Response<()>, Status> {
+    log_request_data(&request);
+
+    // Read the caller identity before `into_inner` consumes the request extensions.
+    let triggered_by = request
+        .extensions()
+        .get::<AuthContext>()
+        .and_then(|ctx| ctx.get_external_user_name())
+        .map(String::from);
+
+    let rpc::InsertNvLinkDomainHealthReportRequest {
+        domain_id,
+        health_report_entry: Some(rpc::HealthReportEntry { report, mode }),
+    } = request.into_inner()
+    else {
+        return Err(CarbideError::MissingArgument("health_report_entry").into());
+    };
+
+    let domain_id = domain_id.ok_or_else(|| CarbideError::MissingArgument("domain_id"))?;
+
+    let Some(report) = report else {
+        return Err(CarbideError::MissingArgument("report").into());
+    };
+
+    let Ok(mode) = rpc::HealthReportApplyMode::try_from(mode) else {
+        return Err(CarbideError::InvalidArgument("mode".to_string()).into());
+    };
+
+    let mode: HealthReportApplyMode = mode.into();
+
+    // Convert and normalize the report before starting a transaction, so a report that fails
+    // conversion is rejected without one. `report` is owned here, so no clone is needed.
+    let mut report = health_report::HealthReport::try_from(report)
+        .map_err(|e| CarbideError::internal(e.to_string()))?;
+
+    if report.observed_at.is_none() {
+        report.observed_at = Some(chrono::Utc::now());
+    }
+    report.triggered_by = triggered_by;
+    report.update_in_alert_since(None);
+
+    let mut txn = api.txn_begin().await?;
+    let health_reports = db::nvlink_domain_health_report::find(&mut txn, &domain_id)
+        .await?
+        .unwrap_or_default();
+
+    // A missing previous report is expected on first insert; only real failures abort.
+    match remove_by_source(&mut txn, &domain_id, &health_reports, report.source.clone()).await {
+        Ok(_) | Err(CarbideError::NotFoundError { .. }) => {}
+        Err(e) => return Err(e.into()),
+    }
+
+    db::nvlink_domain_health_report::insert_health_report(&mut txn, &domain_id, mode, &report)
+        .await?;
+
+    txn.commit().await?;
+
+    Ok(Response::new(()))
+}
+
+/// Removes one health report source from an NVLink domain.
+///
+/// # Errors
+///
+/// Fails when `domain_id` is missing, when no report with the given source is stored
+/// (`CarbideError::NotFoundError`), or when a database call fails.
+pub async fn remove_nv_link_domain_health_report(
+    api: &Api,
+    request: Request<rpc::RemoveNvLinkDomainHealthReportRequest>,
+) -> Result<Response<()>, Status> {
+    log_request_data(&request);
+
+    let rpc::RemoveNvLinkDomainHealthReportRequest { domain_id, source } = request.into_inner();
+    let domain_id = domain_id.ok_or_else(|| CarbideError::MissingArgument("domain_id"))?;
+
+    let mut txn = api.txn_begin().await?;
+    let health_reports = db::nvlink_domain_health_report::find(&mut txn, &domain_id)
+        .await?
+        .unwrap_or_default();
+
+    remove_by_source(&mut txn, &domain_id, &health_reports, source).await?;
+    txn.commit().await?;
+
+    Ok(Response::new(()))
+}
+
+/// Removes the stored report whose source matches `source`.
+///
+/// The apply mode is looked up in `health_reports`: the replace slot is checked first, then the
+/// merge map. Returns `CarbideError::NotFoundError` when neither holds the source.
+async fn remove_by_source(
+    txn: &mut db::Transaction<'_>,
+    domain_id: &NvLinkDomainId,
+    health_reports: &HealthReportSources,
+    source: String,
+) -> Result<(), CarbideError> {
+    let mode = if health_reports.replace.as_ref().map(|o| &o.source) == Some(&source) {
+        HealthReportApplyMode::Replace
+    } else if health_reports.merges.contains_key(&source) {
+        HealthReportApplyMode::Merge
+    } else {
+        return Err(CarbideError::NotFoundError {
+            kind: "NVLink domain health report with source",
+            id: source,
+        });
+    };
+
+    db::nvlink_domain_health_report::remove_health_report(&mut *txn, domain_id, mode, &source)
+        .await?;
+
+    Ok(())
+}
+
+/// Converts stored health report sources into the RPC list response.
+fn list_response(health_reports: HealthReportSources) -> rpc::ListHealthReportResponse {
+    rpc::ListHealthReportResponse {
+        health_report_entries: health_reports
+            .into_iter()
+            .map(|o| HealthReportEntry {
+                report: Some(o.0.into()),
+                // NOTE(review): relies on the model enum's discriminants matching the RPC
+                // enum's wire values; confirm against the protobuf definition.
+                mode: o.1 as i32,
+            })
+            .collect(),
+    }
+}
diff --git a/crates/api/src/tests/mod.rs b/crates/api/src/tests/mod.rs
index af8893c89d..e7ee86dbb9 100644
--- a/crates/api/src/tests/mod.rs
+++ b/crates/api/src/tests/mod.rs
@@ -90,6 +90,7 @@ mod network_segment_find;
 mod network_segment_lifecycle;
 mod nvl_instance;
 mod nvl_logical_partition;
+mod nvlink_domain_health;
 mod operating_system;
 mod power_shelf;
 mod power_shelf_find;
diff --git a/crates/api/src/tests/nvlink_domain_health.rs b/crates/api/src/tests/nvlink_domain_health.rs
new file mode 100644
index 0000000000..857a838ed9
--- /dev/null
+++ b/crates/api/src/tests/nvlink_domain_health.rs
@@ -0,0 +1,132 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+use carbide_uuid::nvlink::NvLinkDomainId;
+use health_report::{HealthAlertClassification, HealthProbeAlert, HealthReport};
+use rpc::forge::forge_server::Forge;
+use rpc::forge::{self as rpc_forge};
+use tonic::Request;
+
+use crate::tests::common::api_fixtures::create_test_env;
+
+/// Builds a report for `source` that carries a single `NvLinkDomainUnhealthy` alert.
+fn alert_report(source: &str) -> Result<HealthReport, Box<dyn std::error::Error>> {
+    Ok(HealthReport {
+        source: source.to_string(),
+        triggered_by: None,
+        observed_at: Some(chrono::Utc::now()),
+        successes: vec![],
+        alerts: vec![HealthProbeAlert {
+            id: "NvLinkDomainUnhealthy".parse()?,
+            target: None,
+            in_alert_since: Some(chrono::Utc::now()),
+            message: "NVLink domain health issue detected".to_string(),
+            tenant_message: None,
+            classifications: vec![
+                HealthAlertClassification::prevent_allocations(),
+                HealthAlertClassification::hardware(),
+            ],
+        }],
+    })
+}
+
+/// Round trip: insert a merge report, list it back, remove it, and list again.
+#[crate::sqlx_test]
+async fn test_insert_list_remove_nvlink_domain_health_report(
+    pool: sqlx::PgPool,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let env = create_test_env(pool.clone()).await;
+    let domain_id: NvLinkDomainId = "00000000-0000-0000-0000-000000000001".parse()?;
+    let report = alert_report("external-monitor")?;
+
+    env.api
+        .insert_nv_link_domain_health_report(Request::new(
+            rpc_forge::InsertNvLinkDomainHealthReportRequest {
+                domain_id: Some(domain_id),
+                health_report_entry: Some(rpc_forge::HealthReportEntry {
+                    report: Some(report.clone().into()),
+                    mode: rpc_forge::HealthReportApplyMode::Merge as i32,
+                }),
+            },
+        ))
+        .await?;
+
+    let list_resp = env
+        .api
+        .list_nv_link_domain_health_reports(Request::new(
+            rpc_forge::ListNvLinkDomainHealthReportsRequest {
+                domain_id: Some(domain_id),
+            },
+        ))
+        .await?
+        .into_inner();
+
+    assert_eq!(list_resp.health_report_entries.len(), 1);
+
+    let listed_entry = list_resp.health_report_entries[0].clone();
+    let listed_report: HealthReport = listed_entry.report.ok_or("missing report")?.try_into()?;
+
+    assert_eq!(listed_report.source, "external-monitor");
+    assert_eq!(listed_report.alerts.len(), 1);
+
+    env.api
+        .remove_nv_link_domain_health_report(Request::new(
+            rpc_forge::RemoveNvLinkDomainHealthReportRequest {
+                domain_id: Some(domain_id),
+                source: "external-monitor".to_string(),
+            },
+        ))
+        .await?;
+
+    let list_resp = env
+        .api
+        .list_nv_link_domain_health_reports(Request::new(
+            rpc_forge::ListNvLinkDomainHealthReportsRequest {
+                domain_id: Some(domain_id),
+            },
+        ))
+        .await?
+        .into_inner();
+
+    assert_eq!(list_resp.health_report_entries.len(), 0);
+
+    Ok(())
+}
+
+/// Removing a source that was never stored must surface as gRPC `NotFound`.
+#[crate::sqlx_test]
+async fn test_remove_nonexistent_nvlink_domain_health_report_source(
+    pool: sqlx::PgPool,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let env = create_test_env(pool.clone()).await;
+    let domain_id: NvLinkDomainId = "00000000-0000-0000-0000-000000000002".parse()?;
+
+    let result = env
+        .api
+        .remove_nv_link_domain_health_report(Request::new(
+            rpc_forge::RemoveNvLinkDomainHealthReportRequest {
+                domain_id: Some(domain_id),
+                source: "nonexistent-source".to_string(),
+            },
+        ))
+        .await;
+
+    assert!(result.is_err());
+
+    let status = result.err().ok_or("missing error")?;
+
+    assert_eq!(status.code(), tonic::Code::NotFound);
+
+    Ok(())
+}
diff --git a/crates/api/src/tests/web/health.rs b/crates/api/src/tests/web/health.rs
index 3300b1ac49..999628bb4c 100644
--- a/crates/api/src/tests/web/health.rs
+++ b/crates/api/src/tests/web/health.rs
@@ -116,6 +116,50 @@ async fn test_add_remove_health_report_via_web_ui(pool: sqlx::PgPool) {
     assert!(!body.contains("web-health-test"));
 }
 
+#[crate::sqlx_test]
+async fn
test_add_remove_nvlink_domain_health_report_via_web_ui(pool: sqlx::PgPool) { + let env = create_test_env(pool).await; + let app = make_test_app(&env); + let domain_id = "00000000-0000-0000-0000-000000000001"; + + let payload = r#"{ + "mode": "Merge", + "health_report": { + "source": "web-nvlink-domain-health-test", + "triggered_by": null, + "observed_at": null, + "successes": [], + "alerts": [{ + "id": "NvLinkDomainWebHealth", + "target": null, + "in_alert_since": null, + "message": "nvlink domain web health", + "tenant_message": null, + "classifications": ["PreventAllocations"] + }] + } + }"#; + + post_nvlink_domain_health_report(&app, domain_id, "add-report", payload).await; + + let body = get_nvlink_domain_health_page(&app, domain_id).await; + let aggregate_health = aggregate_health_section(&body); + assert!(aggregate_health.contains("NvLinkDomainWebHealth")); + assert!(aggregate_health.contains("nvlink domain web health")); + assert!(body.contains("web-nvlink-domain-health-test")); + + post_nvlink_domain_health_report( + &app, + domain_id, + "remove-report", + r#"{"source":"web-nvlink-domain-health-test"}"#, + ) + .await; + + let body = get_nvlink_domain_health_page(&app, domain_id).await; + assert!(!body.contains("web-nvlink-domain-health-test")); +} + #[crate::sqlx_test] async fn test_add_replace_remove_dpu_health_report_via_web_ui(pool: sqlx::PgPool) { let env = create_test_env(pool).await; @@ -636,6 +680,49 @@ async fn get_machine_health_page(app: &axum::Router, machine_id: &str) -> String String::from_utf8_lossy(&body_bytes).into_owned() } +async fn post_nvlink_domain_health_report( + app: &axum::Router, + domain_id: &str, + action: &str, + payload: &str, +) { + let response = app + .clone() + .oneshot( + web_request_builder() + .method(Method::POST) + .uri(format!("/admin/nvlink-domain/{domain_id}/health/{action}")) + .header("Content-Type", "application/json") + .body(Body::from(payload.to_string())) + .unwrap(), + ) + .await + .unwrap(); + 
assert_eq!(response.status(), StatusCode::OK); +} + +async fn get_nvlink_domain_health_page(app: &axum::Router, domain_id: &str) -> String { + let response = app + .clone() + .oneshot( + web_request_builder() + .uri(format!("/admin/nvlink-domain/{domain_id}/health")) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + + let body_bytes = response + .into_body() + .collect() + .await + .expect("Empty response body?") + .to_bytes(); + String::from_utf8_lossy(&body_bytes).into_owned() +} + fn aggregate_health_section(body: &str) -> &str { let active_reports_start = body .find("

Active Health Reports

") diff --git a/crates/api/src/web/health.rs b/crates/api/src/web/health.rs index 692f4f21e1..bfcfb34729 100644 --- a/crates/api/src/web/health.rs +++ b/crates/api/src/web/health.rs @@ -20,8 +20,9 @@ use std::sync::Arc; use askama::Template; use axum::extract::{self, Path as AxumPath, State as AxumState}; -use axum::response::{Html, IntoResponse, Response}; +use axum::response::{Html, IntoResponse, Redirect, Response}; use carbide_uuid::machine::MachineId; +use carbide_uuid::nvlink::NvLinkDomainId; use carbide_uuid::power_shelf::PowerShelfId; use carbide_uuid::rack::RackId; use carbide_uuid::switch::SwitchId; @@ -29,12 +30,14 @@ use health_report::HealthReport; use hyper::http::StatusCode; use rpc::forge::forge_server::Forge; use rpc::forge::{ - HealthReportApplyMode, InsertMachineHealthReportRequest, InsertPowerShelfHealthReportRequest, - InsertRackHealthReportRequest, InsertSwitchHealthReportRequest, + HealthReportApplyMode, InsertMachineHealthReportRequest, InsertNvLinkDomainHealthReportRequest, + InsertPowerShelfHealthReportRequest, InsertRackHealthReportRequest, + InsertSwitchHealthReportRequest, ListNvLinkDomainHealthReportsRequest, ListPowerShelfHealthReportsRequest, ListRackHealthReportsRequest, ListSwitchHealthReportsRequest, MachinesByIdsRequest, PowerShelvesByIdsRequest, - RacksByIdsRequest, RemoveMachineHealthReportRequest, RemovePowerShelfHealthReportRequest, - RemoveRackHealthReportRequest, RemoveSwitchHealthReportRequest, SwitchesByIdsRequest, + RacksByIdsRequest, RemoveMachineHealthReportRequest, RemoveNvLinkDomainHealthReportRequest, + RemovePowerShelfHealthReportRequest, RemoveRackHealthReportRequest, + RemoveSwitchHealthReportRequest, SwitchesByIdsRequest, }; use super::{Base, filters}; @@ -89,6 +92,7 @@ struct LabeledHealthReport { #[derive(Clone)] enum HealthObject { Machine(MachineId), + NvLinkDomain(NvLinkDomainId), PowerShelf(PowerShelfId), Rack(RackId), Switch(SwitchId), @@ -98,6 +102,7 @@ impl HealthObject { fn id(&self) -> String { match 
self { HealthObject::Machine(id) => id.to_string(), + HealthObject::NvLinkDomain(id) => id.to_string(), HealthObject::PowerShelf(id) => id.to_string(), HealthObject::Rack(id) => id.to_string(), HealthObject::Switch(id) => id.to_string(), @@ -107,6 +112,7 @@ impl HealthObject { fn kind(&self) -> &'static str { match self { HealthObject::Machine(_) => "machine", + HealthObject::NvLinkDomain(_) => "nvlink-domain", HealthObject::PowerShelf(_) => "power-shelf", HealthObject::Rack(_) => "rack", HealthObject::Switch(_) => "switch", @@ -116,6 +122,7 @@ impl HealthObject { fn label(&self) -> String { match self { HealthObject::Machine(id) => id.machine_type().to_string(), + HealthObject::NvLinkDomain(_) => "NVLink Domain".to_string(), HealthObject::PowerShelf(_) => "Power Shelf".to_string(), HealthObject::Rack(_) => "Rack".to_string(), HealthObject::Switch(_) => "Switch".to_string(), @@ -126,9 +133,15 @@ impl HealthObject { format!("/admin/{}/{}", self.kind(), self.id()) } + /// Returns the admin health page URL for this object. + fn health_url(&self) -> String { + format!("{}/health", self.detail_url()) + } + fn history_url(&self) -> Option { match self { HealthObject::Machine(id) => Some(format!("/admin/machine/{id}/health-history")), + HealthObject::NvLinkDomain(_) => None, HealthObject::PowerShelf(_) => None, HealthObject::Rack(_) => None, HealthObject::Switch(_) => None, @@ -144,6 +157,11 @@ impl HealthObject { } } +/// Builds the admin health URL for an NVLink domain. +pub(super) fn nvlink_domain_health_url(domain_id: &NvLinkDomainId) -> String { + HealthObject::NvLinkDomain(*domain_id).health_url() +} + struct HealthPageData { entries: Vec<::rpc::forge::HealthReportEntry>, aggregate_health: Option, @@ -224,6 +242,34 @@ pub async fn switch_health( render_health(HealthObject::Switch(switch_id), data) } +/// View NVLink domain health. 
+pub async fn nvlink_domain_health( + AxumState(state): AxumState>, + AxumPath(domain_id): AxumPath, +) -> Response { + let Ok(domain_id) = NvLinkDomainId::from_str(&domain_id) else { + return (StatusCode::BAD_REQUEST, "invalid NVLink domain id").into_response(); + }; + + let data = match fetch_nvlink_domain_health_page_data(&state, &domain_id).await { + Ok(data) => data, + Err(response) => return response, + }; + + render_health(HealthObject::NvLinkDomain(domain_id), data) +} + +/// Redirect NVLink domain details to the health page. +pub async fn nvlink_domain_detail(AxumPath(domain_id): AxumPath) -> Response { + let Ok(domain_id) = NvLinkDomainId::from_str(&domain_id) else { + return (StatusCode::BAD_REQUEST, "invalid NVLink domain id").into_response(); + }; + + let health_url = nvlink_domain_health_url(&domain_id); + + Redirect::to(&health_url).into_response() +} + fn render_health(object: HealthObject, data: HealthPageData) -> Response { let mut entries = data.entries; // Sort by type first and source name second. 
@@ -365,6 +411,30 @@ async fn fetch_switch_health_page_data( }) } +async fn fetch_nvlink_domain_health_page_data( + api: &Api, + domain_id: &NvLinkDomainId, +) -> Result { + let entries = match list_nvlink_domain_health_report_entries(api, domain_id).await { + Ok(entries) => entries, + Err(err) if err.code() == tonic::Code::NotFound => Vec::new(), + Err(err) => { + tracing::error!(%err, %domain_id, "list_nvlink_domain_health_reports"); + return Err((StatusCode::INTERNAL_SERVER_ERROR, Html(err.to_string())).into_response()); + } + }; + let aggregate_health = aggregate_health_report_entries(&entries); + + Ok(HealthPageData { + entries, + aggregate_health, + health_contributors: Vec::new(), + history: HealthHistoryTable { + records: Vec::new(), + }, + }) +} + async fn fetch_dpu_health_contributors( api: &Api, host_machine_id: &MachineId, @@ -453,6 +523,21 @@ async fn list_switch_health_report_entries( .health_report_entries) } +async fn list_nvlink_domain_health_report_entries( + api: &Api, + domain_id: &NvLinkDomainId, +) -> Result, tonic::Status> { + Ok(api + .list_nv_link_domain_health_reports(tonic::Request::new( + ListNvLinkDomainHealthReportsRequest { + domain_id: Some(*domain_id), + }, + )) + .await? + .into_inner() + .health_report_entries) +} + async fn fetch_machine_health_snapshot( api: &Api, machine_id: &MachineId, @@ -729,6 +814,27 @@ pub async fn add_switch_health_report( .await } +/// Insert an NVLink domain health report from the admin web UI. 
+pub async fn add_nvlink_domain_health_report( + AxumState(state): AxumState>, + AxumPath(domain_id): AxumPath, + auth_context: Option>, + extract::Json(payload): extract::Json, +) -> Response { + let domain_id = match domain_id.parse::() { + Ok(id) => id, + Err(e) => return (StatusCode::BAD_REQUEST, e.to_string()).into_response(), + }; + + add_health_report_for( + state, + HealthObject::NvLinkDomain(domain_id), + auth_context, + payload, + ) + .await +} + async fn add_health_report_for( state: Arc, object: HealthObject, @@ -756,6 +862,19 @@ async fn add_health_report_for( .await .map(|response| response.into_inner()) } + HealthObject::NvLinkDomain(domain_id) => { + let mut request = tonic::Request::new(InsertNvLinkDomainHealthReportRequest { + domain_id: Some(domain_id), + health_report_entry: Some(entry), + }); + if let Some(axum::Extension(auth_context)) = auth_context { + request.extensions_mut().insert(auth_context); + } + state + .insert_nv_link_domain_health_report(request) + .await + .map(|response| response.into_inner()) + } HealthObject::PowerShelf(power_shelf_id) => { let mut request = tonic::Request::new(InsertPowerShelfHealthReportRequest { power_shelf_id: Some(power_shelf_id), @@ -861,6 +980,20 @@ pub async fn remove_switch_health_report( remove_health_report_for(state, HealthObject::Switch(switch_id), payload).await } +/// Remove an NVLink domain health report from the admin web UI. 
+pub async fn remove_nvlink_domain_health_report( + AxumState(state): AxumState>, + AxumPath(domain_id): AxumPath, + extract::Json(payload): extract::Json, +) -> Response { + let domain_id = match domain_id.parse::() { + Ok(id) => id, + Err(e) => return (StatusCode::BAD_REQUEST, e.to_string()).into_response(), + }; + + remove_health_report_for(state, HealthObject::NvLinkDomain(domain_id), payload).await +} + async fn remove_health_report_for( state: Arc, object: HealthObject, @@ -876,6 +1009,15 @@ async fn remove_health_report_for( })) .await .map(|response| response.into_inner()), + HealthObject::NvLinkDomain(domain_id) => state + .remove_nv_link_domain_health_report(tonic::Request::new( + RemoveNvLinkDomainHealthReportRequest { + domain_id: Some(domain_id), + source: payload.source, + }, + )) + .await + .map(|response| response.into_inner()), HealthObject::PowerShelf(power_shelf_id) => state .remove_power_shelf_health_report(tonic::Request::new( RemovePowerShelfHealthReportRequest { @@ -920,6 +1062,30 @@ fn health_report_from_rpc_convert_invalid( .unwrap_or_else(health_report::HealthReport::malformed_report) } +fn aggregate_health_report_entries( + entries: &[::rpc::forge::HealthReportEntry], +) -> Option { + let mut aggregate = HealthReport::empty("aggregate-nvlink-domain-health".to_string()); + let mut has_merge = false; + let mut replace = None; + + for entry in entries { + let Some(report) = entry.report.as_ref() else { + continue; + }; + let report = health_report_from_rpc_convert_invalid(report.clone()); + + if entry.mode() == HealthReportApplyMode::Replace { + replace = Some(report); + } else { + aggregate.merge(&report); + has_merge = true; + } + } + + replace.or_else(|| has_merge.then_some(aggregate)) +} + pub(super) async fn fetch_health_history( api: &Api, machine_id: &MachineId, diff --git a/crates/api/src/web/machine.rs b/crates/api/src/web/machine.rs index a0d664ed64..1d2675476e 100644 --- a/crates/api/src/web/machine.rs +++ 
b/crates/api/src/web/machine.rs @@ -33,7 +33,7 @@ use serde::Deserialize; use super::pagination::{self, PageContext, PaginationParams}; use super::state_history::StateHistoryTable; -use super::{Base, filters}; +use super::{Base, filters, health}; use crate::api::Api; use crate::web::action_status::{self, ActionStatus}; @@ -502,6 +502,7 @@ struct MachineIbInterfaceDisplay { #[derive(Debug, Default)] struct MachineNvLinkGpuDisplay { domain_uuid: String, + domain_health_url: String, tray_index: i32, slot_id: i32, device_instance: i32, @@ -619,11 +620,16 @@ impl From for MachineDetail<'_> { } if let Some(nvlink_info) = m.nvlink_info { + let domain_id = nvlink_info.domain_uuid.unwrap_or_default(); + let domain_uuid = domain_id.to_string(); + let domain_health_url = health::nvlink_domain_health_url(&domain_id); + nvlink_gpus = nvlink_info .gpus .into_iter() .map(|gpu| MachineNvLinkGpuDisplay { - domain_uuid: nvlink_info.domain_uuid.unwrap_or_default().to_string(), + domain_uuid: domain_uuid.clone(), + domain_health_url: domain_health_url.clone(), tray_index: gpu.tray_index, slot_id: gpu.slot_id, guid: gpu.guid, diff --git a/crates/api/src/web/mod.rs b/crates/api/src/web/mod.rs index e840d7e382..31e3773948 100644 --- a/crates/api/src/web/mod.rs +++ b/crates/api/src/web/mod.rs @@ -649,6 +649,22 @@ pub fn routes(api: Arc) -> eyre::Result> { "/machine/{machine_id}/health/remove-report", post(health::remove_machine_health_report), ) + .route( + "/nvlink-domain/{domain_id}", + get(health::nvlink_domain_detail), + ) + .route( + "/nvlink-domain/{domain_id}/health", + get(health::nvlink_domain_health), + ) + .route( + "/nvlink-domain/{domain_id}/health/add-report", + post(health::add_nvlink_domain_health_report), + ) + .route( + "/nvlink-domain/{domain_id}/health/remove-report", + post(health::remove_nvlink_domain_health_report), + ) .route( "/machine/{machine_id}/attestation-results", get(attestation::show_attestation_results), @@ -718,6 +734,14 @@ pub fn routes(api: Arc) -> 
eyre::Result> { ) .route("/operating-system/{os_id}", get(operating_system::detail)) .route("/nmxc-browser", get(nmxc_browser::query)) + .route( + "/nvlink-domain", + get(nvlink::show_nvlink_domain_health_html), + ) + .route( + "/nvlink-domain.json", + get(nvlink::show_nvlink_domain_health_json), + ) .route( "/nvlink-partition", get(nvlink::show_nvlink_logical_partitions_html), diff --git a/crates/api/src/web/nvlink.rs b/crates/api/src/web/nvlink.rs index 13cac50e18..b880d63f35 100644 --- a/crates/api/src/web/nvlink.rs +++ b/crates/api/src/web/nvlink.rs @@ -20,15 +20,16 @@ use std::sync::Arc; use askama::Template; use axum::Json; -use axum::extract::{Path as AxumPath, State as AxumState}; +use axum::extract::{OriginalUri, Path as AxumPath, Query, State as AxumState}; use axum::response::{Html, IntoResponse, Response}; -use carbide_uuid::nvlink::NvLinkPartitionId; +use carbide_uuid::nvlink::{NvLinkDomainId, NvLinkPartitionId}; use hyper::http::StatusCode; use rpc::forge as forgerpc; use rpc::forge::forge_server::Forge; use uuid::Uuid; -use super::Base; +use super::pagination::{self, PageContext, PaginationParams}; +use super::{Base, health}; use crate::api::Api; #[derive(serde::Serialize, Template)] @@ -45,6 +46,29 @@ struct LogicalPartitionRowDisplay { physical_partitions: usize, } +#[derive(Template)] +#[template(path = "nvlink_domain_health_show.html")] +struct NvLinkDomainHealthShow { + domains: Vec, + json_path: String, + search_query: String, + page: PageContext, +} + +#[derive(serde::Serialize)] +struct NvLinkDomainHealthRow { + id: String, + health_url: String, +} + +#[derive(serde::Deserialize, Debug, Default)] +pub(super) struct NvLinkDomainHealthParams { + #[serde(flatten)] + pagination: PaginationParams, + #[serde(default)] + q: String, +} + #[derive(serde::Serialize, Clone)] struct ShowLogicalPartition { partition: forgerpc::NvLinkLogicalPartition, @@ -80,9 +104,21 @@ impl From for LogicalPartitionRowDisplay { } } } + +impl From for 
NvLinkDomainHealthRow { + fn from(id: NvLinkDomainId) -> Self { + Self { + id: id.to_string(), + health_url: health::nvlink_domain_health_url(&id), + } + } +} + #[derive(serde::Serialize, Clone)] struct ShowPhysicalPartitionDetail { id: String, + domain_uuid: String, + domain_health_url: String, name: String, nmx_m_id: String, members: Vec, @@ -102,14 +138,24 @@ impl From for LogicalPartitionDetail { fn from(show: ShowLogicalPartition) -> Self { let mut physical_partitions = Vec::new(); for s in show.physical_partitions { + let domain_uuid = s.partition.domain_uuid; + let domain_health_url = domain_uuid + .as_ref() + .map(health::nvlink_domain_health_url) + .unwrap_or_default(); + let pp = ShowPhysicalPartitionDetail { id: s.partition.id.map(|i| i.to_string()).unwrap_or_default(), + domain_uuid: domain_uuid.map(|id| id.to_string()).unwrap_or_default(), + domain_health_url, name: s.partition.name, nmx_m_id: s.partition.nmx_m_id, members: s.members, }; + physical_partitions.push(pp); } + let created = show .partition .created @@ -173,6 +219,67 @@ pub async fn show_nvlink_logical_partitions_json( (StatusCode::OK, Json(partitions)).into_response() } +/// List NVLink domains with health reports. 
+pub async fn show_nvlink_domain_health_html( + AxumState(state): AxumState>, + Query(params): Query, + uri: OriginalUri, +) -> Response { + let rows = match fetch_nvlink_domain_health_rows(&state).await { + Ok(rows) => rows, + Err(err) => { + tracing::error!(%err, "fetch_nvlink_domain_health_rows"); + return ( + StatusCode::INTERNAL_SERVER_ERROR, + "Error loading NVLink domain health", + ) + .into_response(); + } + }; + + let search_query = params.q.trim().to_string(); + let rows = filter_nvlink_domain_health_rows(rows, &search_query); + let extra_query_params = domain_health_extra_query_params(&search_query); + let (info, domains) = pagination::paginate_vec(rows, ¶ms.pagination); + let path = uri.path(); + + let tmpl = NvLinkDomainHealthShow { + domains, + json_path: format!("{path}.json"), + search_query, + page: PageContext::new(info, path).with_extra_params(extra_query_params), + }; + + match tmpl.render() { + Ok(html) => (StatusCode::OK, Html(html)).into_response(), + Err(err) => { + tracing::error!(%err, "render_nvlink_domain_health"); + ( + StatusCode::INTERNAL_SERVER_ERROR, + "Error rendering NVLink domain health", + ) + .into_response() + } + } +} + +/// List NVLink domains with health reports as JSON. +pub async fn show_nvlink_domain_health_json(AxumState(state): AxumState>) -> Response { + let rows = match fetch_nvlink_domain_health_rows(&state).await { + Ok(rows) => rows, + Err(err) => { + tracing::error!(%err, "fetch_nvlink_domain_health_rows"); + return ( + StatusCode::INTERNAL_SERVER_ERROR, + Json("Error loading NVLink domain health".to_string()), + ) + .into_response(); + } + }; + + (StatusCode::OK, Json(rows)).into_response() +} + /// View Logical Partition details pub async fn detail( AxumState(state): AxumState>, @@ -215,6 +322,40 @@ pub async fn detail( (StatusCode::OK, Html(tmpl.render().unwrap())).into_response() } +/// Fetches NVLink domain rows from the health-report table. 
+async fn fetch_nvlink_domain_health_rows( + api: &Api, +) -> Result, db::DatabaseError> { + let ids = db::nvlink_domain_health_report::list_domain_ids(api.db_reader().as_mut()).await?; + + Ok(ids.into_iter().map(Into::into).collect()) +} + +/// Filters NVLink domain rows by case-insensitive domain ID substring. +fn filter_nvlink_domain_health_rows( + rows: Vec, + search_query: &str, +) -> Vec { + if search_query.is_empty() { + return rows; + } + + let search_query = search_query.to_ascii_lowercase(); + + rows.into_iter() + .filter(|row| row.id.contains(&search_query)) + .collect() +} + +/// Builds query parameters that pagination links must preserve. +fn domain_health_extra_query_params(search_query: &str) -> String { + if search_query.is_empty() { + String::new() + } else { + format!("&q={}", urlencoding::encode(search_query)) + } +} + async fn fetch_logical_partitions( api: Arc, detail: bool, @@ -383,4 +524,5 @@ async fn fetch_logical_partitions( } impl super::Base for LogicalPartitionShow {} +impl super::Base for NvLinkDomainHealthShow {} impl super::Base for LogicalPartitionDetail {} diff --git a/crates/api/templates/base.html b/crates/api/templates/base.html index b4da27df53..a1461f7d0c 100644 --- a/crates/api/templates/base.html +++ b/crates/api/templates/base.html @@ -86,6 +86,7 @@

InfiniBand

NVLink


diff --git a/crates/api/templates/logical_partition_detail.html b/crates/api/templates/logical_partition_detail.html index 668ab64ae9..1f23e1e796 100644 --- a/crates/api/templates/logical_partition_detail.html +++ b/crates/api/templates/logical_partition_detail.html @@ -18,9 +18,10 @@

Physical Partitions

ID + Domain UUID Name NMX-M-ID - Members> + Members @@ -28,6 +29,7 @@

Physical Partitions

{% for p in physical_partitions %} {{ p.id }} + {{ p.domain_uuid }} {{ p.name }} {{ p.nmx_m_id }} @@ -60,7 +62,7 @@

Physical Partitions

+ {% endblock %} - diff --git a/crates/api/templates/machine_detail.html b/crates/api/templates/machine_detail.html index 4d48576ac2..3087dcfedd 100644 --- a/crates/api/templates/machine_detail.html +++ b/crates/api/templates/machine_detail.html @@ -324,7 +324,7 @@

NVLink GPUs

{% for gpu in nvlink_gpus %} {{ gpu.device_instance}} - {{ gpu.domain_uuid }} + {{ gpu.domain_uuid }} {{ gpu.guid }} {{ gpu.tray_index }} {{ gpu.slot_id }} diff --git a/crates/api/templates/nvlink_domain_health_show.html b/crates/api/templates/nvlink_domain_health_show.html new file mode 100644 index 0000000000..5c8f450ba9 --- /dev/null +++ b/crates/api/templates/nvlink_domain_health_show.html @@ -0,0 +1,48 @@ +{% extends "base.html" %} + +{% block title %}NVLink Domain Health{% endblock %} + +{% block content %} + +

NVLink Domain Health ({{ page.total_items() }})

+ +
+ + + + + +
+ + + + {% if search_query != "" %} + Clear + {% endif %} +
+
+ + + + + + + + {% for domain in domains %} + + + + + {% endfor %} + + + + + + + +
Domain IDHealth Reports
{{ domain.id }}Health reports
{{ domains.len() }} item{% if domains.len() != 1 %}s{% endif %}
+ +{% include "pages_footer.html" %} + +{% endblock %} diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 091f475abc..e06f13f252 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -162,6 +162,12 @@ service Forge { rpc InsertPowerShelfHealthReport(InsertPowerShelfHealthReportRequest) returns (google.protobuf.Empty); // Removes a health report source for a Power Shelf rpc RemovePowerShelfHealthReport(RemovePowerShelfHealthReportRequest) returns (google.protobuf.Empty); + // Lists all health report sources for an NVLink domain + rpc ListNVLinkDomainHealthReports(ListNVLinkDomainHealthReportsRequest) returns (ListHealthReportResponse); + // Adds a health report source for an NVLink domain + rpc InsertNVLinkDomainHealthReport(InsertNVLinkDomainHealthReportRequest) returns (google.protobuf.Empty); + // Removes a health report source for an NVLink domain + rpc RemoveNVLinkDomainHealthReport(RemoveNVLinkDomainHealthReportRequest) returns (google.protobuf.Empty); // Deprecated aliases for the machine health report RPCs. // These exist so older clients (admin-cli binaries, scripts pinned to the @@ -4889,6 +4895,23 @@ message RemoveMachineHealthReportRequest { string source = 2; } +// Request to list health report sources for an NVLink domain. +message ListNVLinkDomainHealthReportsRequest { + optional common.NVLinkDomainId domain_id = 1; +} + +// Request to insert a health report source for an NVLink domain. +message InsertNVLinkDomainHealthReportRequest { + common.NVLinkDomainId domain_id = 1; + HealthReportEntry health_report_entry = 2; +} + +// Request to remove a health report source for an NVLink domain. +message RemoveNVLinkDomainHealthReportRequest { + common.NVLinkDomainId domain_id = 1; + string source = 2; +} + // Observed status of a single network interface of an instance message InstanceInterfaceStatusObservation { // Whether the interface is a physical or virtual function