diff --git a/app/api/metrics/route.ts b/app/api/metrics/route.ts index f3cf6d1..d59b806 100644 --- a/app/api/metrics/route.ts +++ b/app/api/metrics/route.ts @@ -1,6 +1,6 @@ import { NextResponse } from "next/server"; import * as si from "systeminformation"; -import { detectROCm } from "@/lib/system/rocm"; +import { detectROCm, getAdvancedAmdGpuMetrics, isAmdSmiAvailable } from "@/lib/system/rocm"; import type { SystemMetrics } from "@/types/metrics"; export const dynamic = "force-dynamic"; @@ -138,40 +138,72 @@ interface GpuOutput { } async function getGpuMetrics(): Promise<{ gpus: GpuOutput[]; rocmDetected: boolean; rocmRuntimeVersion: string }> { + const amdSmiAvail = await isAmdSmiAvailable(); + // First try ROCm detection try { const rocData = await detectROCm(); if (rocData.gpus && rocData.gpus.length > 0) { + // Fetch advanced metrics for each GPU if amd-smi is available + const enhancedGpus = await Promise.all( + rocData.gpus.map(async (gpu) => { + const baseGpu = { + index: gpu.index, + name: gpu.name, + marketingName: gpu.marketingName, + vendor: gpu.vendor, + usage: gpu.usage ?? 0, + memory: gpu.memory || { total: 0, used: 0 }, + gttMemory: gpu.gttMemory, + temperature: gpu.temperature ?? null, + temperatureHotspot: gpu.temperatureHotspot ?? null, + temperatureMem: gpu.temperatureMem ?? null, + power: gpu.power ?? null, + driverVersion: gpu.driverVersion || "Unknown", + gfxVersion: gpu.gfxVersion, + deviceId: gpu.deviceId || "N/A", + computeUnits: gpu.computeUnits, + maxClockMHz: gpu.maxClockMHz, + currentClockMHz: gpu.currentClockMHz || 0, + memoryClockMHz: gpu.memoryClockMHz ?? null, + vbiosVersion: gpu.vbiosVersion, + pciBus: gpu.pciBus, + vramType: gpu.vramType, + vramBitWidth: gpu.vramBitWidth, + pcieWidth: gpu.pcieWidth ?? null, + pcieSpeed: gpu.pcieSpeed ?? null, + eccCorrectable: gpu.eccCorrectable ?? null, + eccUncorrectable: gpu.eccUncorrectable ?? null, + isThrottling: gpu.isThrottling ?? false, + }; + + // Add advanced metrics if amd-smi is available + if (amdSmiAvail) { + try { + const advancedMetrics = await getAdvancedAmdGpuMetrics(gpu.index); + return { + ...baseGpu, + engineUtilization: advancedMetrics.engineMetrics, + thermal: advancedMetrics.thermalMetrics, + powerMetrics: advancedMetrics.powerMetrics, + clocks: advancedMetrics.clockMetrics, + pcieMetrics: advancedMetrics.pcieMetrics, + xgmiMetrics: advancedMetrics.xgmiMetrics, + mediaEngines: advancedMetrics.mediaMetrics, + eccMetrics: advancedMetrics.eccMetrics, + }; + } catch { + // Advanced metrics failed, return base GPU + return baseGpu; + } + } + + return baseGpu; + }) + ); + return { - gpus: rocData.gpus.map((gpu) => ({ - index: gpu.index, - name: gpu.name, - marketingName: gpu.marketingName, - vendor: gpu.vendor, - usage: gpu.usage ?? 0, - memory: gpu.memory || { total: 0, used: 0 }, - gttMemory: gpu.gttMemory, - temperature: gpu.temperature ?? null, - temperatureHotspot: gpu.temperatureHotspot ?? null, - temperatureMem: gpu.temperatureMem ?? null, - power: gpu.power ?? null, - driverVersion: gpu.driverVersion || "Unknown", - gfxVersion: gpu.gfxVersion, - deviceId: gpu.deviceId || "N/A", - computeUnits: gpu.computeUnits, - maxClockMHz: gpu.maxClockMHz, - currentClockMHz: gpu.currentClockMHz || 0, - memoryClockMHz: gpu.memoryClockMHz ?? null, - vbiosVersion: gpu.vbiosVersion, - pciBus: gpu.pciBus, - vramType: gpu.vramType, - vramBitWidth: gpu.vramBitWidth, - pcieWidth: gpu.pcieWidth ?? null, - pcieSpeed: gpu.pcieSpeed ?? null, - eccCorrectable: gpu.eccCorrectable ?? null, - eccUncorrectable: gpu.eccUncorrectable ?? null, - isThrottling: gpu.isThrottling ?? false, - })), + gpus: enhancedGpus, rocmDetected: true, rocmRuntimeVersion: rocData.runtimeVersion || "", }; @@ -315,6 +347,8 @@ export async function GET(): Promise { arch: osInfo.arch || "Unknown", }; + const amdSmiAvail = await isAmdSmiAvailable(); + const response: SystemMetrics = { timestamp: Date.now(), cpu: cpuMetrics, @@ -325,6 +359,7 @@ export async function GET(): Promise { os: osMetrics, rocmDetected: gpuData.rocmDetected, rocmRuntimeVersion: gpuData.rocmRuntimeVersion, + amdSmiAvailable: amdSmiAvail, }; return NextResponse.json(response, { headers: corsHeaders }); diff --git a/lib/components/GpuEccPanel.tsx b/lib/components/GpuEccPanel.tsx new file mode 100644 index 0000000..e511b8e --- /dev/null +++ b/lib/components/GpuEccPanel.tsx @@ -0,0 +1,90 @@ +"use client"; + +import type { GpuEcc } from "@/types/metrics"; + +interface GpuEccPanelProps { + ecc?: GpuEcc; +} + +export default function GpuEccPanel({ ecc }: GpuEccPanelProps) { + if (!ecc) { + return ( +
+
+
+ + + +
+
+

+ ECC Status +

+
+
+

+ ECC data not available +

+
+ ); + } + + const { correctable, uncorrectable } = ecc; + const hasErrors = correctable > 0 || uncorrectable > 0; + + return ( +
+
+
+ + + +
+
+

+ ECC Status +

+ {hasErrors ? ( +

Errors Detected

+ ) : ( +

No Errors

+ )} +
+
+ +
+ {/* Correctable Errors */} +
+
+ Correctable + 0 ? "#eab308" : "var(--foreground)" }} + > + {correctable} + +
+
+ + {/* Uncorrectable Errors */} +
+
+ Uncorrectable + 0 ? "#ef4444" : "var(--foreground)" }} + > + {uncorrectable} + +
+
+
+
+ ); +} diff --git a/lib/components/GpuEnginesPanel.tsx b/lib/components/GpuEnginesPanel.tsx new file mode 100644 index 0000000..2d95db0 --- /dev/null +++ b/lib/components/GpuEnginesPanel.tsx @@ -0,0 +1,101 @@ +"use client"; + +import type { GpuEngineUtilization } from "@/types/metrics"; + +interface GpuEnginesPanelProps { + engines: GpuEngineUtilization; +} + +export default function GpuEnginesPanel({ engines }: GpuEnginesPanelProps) { + const { gfx, mem, mm } = engines; + + // Determine workload type based on engine utilization + const getWorkloadType = () => { + if (gfx > 70 && mem < 40) return { label: "Compute-Bound", color: "text-blue-400" }; + if (mem > 70 && gfx < 40) return { label: "Memory-Bound", color: "text-yellow-400" }; + if (mm > 50) return { label: "Video Workload", color: "text-purple-400" }; + if (gfx > 70 && mem > 70) return { label: "Balanced Load", color: "text-green-400" }; + return { label: "Light Load", color: "text-muted-foreground" }; + }; + + const workload = getWorkloadType(); + + return ( +
+
+
+ + + +
+
+

+ GPU Engines +

+

+ {workload.label} +

+
+
+ +
+ {/* GFX Engine */} +
+
+ GFX (Compute) + 80 ? "#ef4444" : "var(--foreground)" }}> + {Math.round(gfx)}% + +
+
+
80 ? "#ef4444" : gfx > 50 ? "var(--primary)" : "#60a5fa", + }} + /> +
+
+ + {/* MEM Engine */} +
+
+ MEM (Memory) + 80 ? "#ef4444" : "var(--foreground)" }}> + {Math.round(mem)}% + +
+
+
80 ? "#ef4444" : mem > 50 ? "#fbbf24" : "#60a5fa", + }} + /> +
+
+ + {/* MM Engine */} +
+
+ MM (Multimedia) + + {Math.round(mm)}% + +
+
+
50 ? "#a855f7" : "#60a5fa", + }} + /> +
+
+
+
+ ); +} diff --git a/lib/components/GpuMediaPanel.tsx b/lib/components/GpuMediaPanel.tsx new file mode 100644 index 0000000..b097f41 --- /dev/null +++ b/lib/components/GpuMediaPanel.tsx @@ -0,0 +1,77 @@ +"use client"; + +import type { GpuMedia } from "@/types/metrics"; + +interface GpuMediaPanelProps { + media: GpuMedia; +} + +export default function GpuMediaPanel({ media }: GpuMediaPanelProps) { + const { encoder, decoder } = media; + + const formatPercent = (val: number | null) => { + if (val === null) return "N/A"; + return `${Math.round(val)}%`; + }; + + return ( +
+
+
+ + + +
+
+

+ Media Engines +

+
+
+ +
+ {/* Encoder */} +
+
+ Encoder + + {formatPercent(encoder)} + +
+ {encoder !== null && ( +
+
70 ? "#a855f7" : "#60a5fa", + }} + /> +
+ )} +
+ + {/* Decoder */} +
+
+ Decoder + + {formatPercent(decoder)} + +
+ {decoder !== null && ( +
+
70 ? "#a855f7" : "#60a5fa", + }} + /> +
+ )} +
+
+
+ ); +} diff --git a/lib/components/GpuPciePanel.tsx b/lib/components/GpuPciePanel.tsx new file mode 100644 index 0000000..5bbe71f --- /dev/null +++ b/lib/components/GpuPciePanel.tsx @@ -0,0 +1,88 @@ +"use client"; + +import type { GpuPcie, GpuXgmi } from "@/types/metrics"; + +interface GpuPciePanelProps { + pcie?: GpuPcie; + xgmi?: GpuXgmi; +} + +export default function GpuPciePanel({ pcie, xgmi }: GpuPciePanelProps) { + const formatBandwidth = (bw: number | null) => { + if (bw === null) return "N/A"; + if (bw > 1000) return `${(bw / 1000).toFixed(1)} GB/s`; + return `${Math.round(bw)} MB/s`; + }; + + return ( +
+
+
+ + + +
+
+

+ PCIe & XGMI +

+
+
+ +
+ {/* PCIe Info */} + {pcie && ( +
+
+ PCIe Link + + x{pcie.width || "?"} {pcie.speed || "N/A"} + +
+ {pcie.bandwidth !== null && ( +
+ Bandwidth + + {formatBandwidth(pcie.bandwidth)} + +
+ )} + {pcie.replayErrors !== null && pcie.replayErrors > 0 && ( +
+ Replay Errors + {pcie.replayErrors} +
+ )} +
+ )} + + {/* XGMI Info */} + {xgmi && ( +
+
+ XGMI Link + + {xgmi.linkStatus || "Inactive"} + +
+ {xgmi.bandwidth !== null && ( +
+ Bandwidth + + {formatBandwidth(xgmi.bandwidth)} + +
+ )} +
+ )} + + {/* No data state */} + {!pcie && !xgmi && ( +

+ PCIe/XGMI data not available +

+ )} +
+
+ ); +} diff --git a/lib/components/GpuPowerPanel.tsx b/lib/components/GpuPowerPanel.tsx new file mode 100644 index 0000000..dc8862c --- /dev/null +++ b/lib/components/GpuPowerPanel.tsx @@ -0,0 +1,70 @@ +"use client"; + +import type { GpuPower } from "@/types/metrics"; + +interface GpuPowerPanelProps { + power: GpuPower; +} + +export default function GpuPowerPanel({ power }: GpuPowerPanelProps) { + const { instant, average, voltage } = power; + + const formatPower = (w: number | null) => { + if (w === null) return "N/A"; + return `${w.toFixed(1)}W`; + }; + + const formatVoltage = (v: number | null) => { + if (v === null) return "N/A"; + return `${v.toFixed(2)}V`; + }; + + return ( +
+
+
+ + + +
+
+

+ Power Delivery +

+
+
+ +
+ {/* Instant Power */} +
+

+ Instant +

+

150 ? "#ef4444" : "var(--foreground)" }}> + {formatPower(instant)} +

+
+ + {/* Average Power */} +
+

+ Average +

+

+ {formatPower(average)} +

+
+ + {/* Voltage */} +
+

+ Voltage +

+

+ {formatVoltage(voltage)} +

+
+
+
+ ); +} diff --git a/lib/components/GpuTab.tsx b/lib/components/GpuTab.tsx index aacdcc6..661616b 100644 --- a/lib/components/GpuTab.tsx +++ b/lib/components/GpuTab.tsx @@ -9,6 +9,12 @@ import { AlertTriangle, Activity, } from "lucide-react"; +import GpuEnginesPanel from "./GpuEnginesPanel"; +import GpuThermalPanel from "./GpuThermalPanel"; +import GpuPowerPanel from "./GpuPowerPanel"; +import GpuMediaPanel from "./GpuMediaPanel"; +import GpuPciePanel from "./GpuPciePanel"; +import GpuEccPanel from "./GpuEccPanel"; function formatGB(gb: number) { return `${gb.toFixed(1)} GB`; @@ -411,6 +417,46 @@ export default function GpuTab({ gpus }: GpuTabProps) {
)} + + {/* Advanced Metrics Panels */} + {(() => { + const hasAdvancedMetrics = + primaryGpu.engineUtilization || + primaryGpu.thermal || + primaryGpu.powerMetrics || + primaryGpu.mediaEngines || + primaryGpu.pcieMetrics || + primaryGpu.xgmiMetrics || + primaryGpu.eccMetrics; + + if (!hasAdvancedMetrics) return null; + + return ( +
+

+ AMD Advanced Metrics +

+
+ {primaryGpu.engineUtilization && ( + + )} + {primaryGpu.thermal && ( + + )} + {primaryGpu.powerMetrics && ( + + )} + {primaryGpu.mediaEngines && ( + + )} + {(primaryGpu.pcieMetrics || primaryGpu.xgmiMetrics) && ( + + )} + +
+
+ ); + })()}
{/* Additional GPUs */} diff --git a/lib/components/GpuThermalPanel.tsx b/lib/components/GpuThermalPanel.tsx new file mode 100644 index 0000000..913e42f --- /dev/null +++ b/lib/components/GpuThermalPanel.tsx @@ -0,0 +1,78 @@ +"use client"; + +import type { GpuThermal } from "@/types/metrics"; + +interface GpuThermalPanelProps { + thermal: GpuThermal; +} + +export default function GpuThermalPanel({ thermal }: GpuThermalPanelProps) { + const { edge, junction, memory, isThrottling, throttleReason } = thermal; + + const formatTemp = (temp: number | null) => { + if (temp === null) return "N/A"; + return `${Math.round(temp)}°C`; + }; + + const getTempColor = (temp: number | null, threshold: number) => { + if (temp === null) return "text-muted-foreground"; + if (temp > threshold) return "text-red-500"; + if (temp > threshold * 0.85) return "text-yellow-500"; + return "text-green-500"; + }; + + return ( +
+
+
+ + + +
+
+

+ Thermal Sensors +

+ {isThrottling && ( +

+ ⚠️ Throttling Active + {throttleReason && ({throttleReason})} +

+ )} +
+
+ +
+ {/* Edge Temp */} +
+

+ Edge +

+

+ {formatTemp(edge)} +

+
+ + {/* Junction Temp */} +
+

+ Junction +

+

+ {formatTemp(junction)} +

+
+ + {/* Memory Temp */} +
+

+ Memory +

+

+ {formatTemp(memory)} +

+
+
+
+ ); +} diff --git a/lib/system/rocm.ts b/lib/system/rocm.ts index bb74240..1f581bd 100644 --- a/lib/system/rocm.ts +++ b/lib/system/rocm.ts @@ -59,6 +59,7 @@ interface ROCmSystemInfo { detected: boolean; rocmInfoPath?: string; rocmSmiPath?: string; + amdSmiAvailable?: boolean; } /** @@ -1068,6 +1069,7 @@ export async function detectROCm(): Promise { gpus, rocmInfoPath, rocmSmiPath, + amdSmiAvailable: true, }; } catch (error) { console.error("Failed to detect ROCm:", error); @@ -1075,7 +1077,171 @@ export async function detectROCm(): Promise { detected: false, runtimeVersion: "", gpus: [], + amdSmiAvailable: false, }; } } +// Advanced AMD GPU Metrics Functions + +export interface AmdGpuEngineMetrics { + gfx: number; + mem: number; + mm: number; +} + +export interface AmdGpuThermalMetrics { + edge: number | null; + junction: number | null; + memory: number | null; + isThrottling: boolean; + throttleReason?: string; +} + +export interface AmdGpuPowerMetrics { + instant: number | null; + average: number | null; + voltage: number | null; +} + +export interface AmdGpuClockMetrics { + sclk: number | null; + mclk: number | null; +} + +export interface AmdGpuPcieMetrics { + width: number | null; + speed: string | null; + bandwidth: number | null; + replayErrors: number | null; +} + +export interface AmdGpuXgmiMetrics { + bandwidth: number | null; + linkStatus: string | null; +} + +export interface AmdGpuMediaMetrics { + encoder: number | null; + decoder: number | null; +} + +export interface AmdGpuEccMetrics { + correctable: number; + uncorrectable: number; +} + +export interface AdvancedAmdGpuMetrics { + engineMetrics?: AmdGpuEngineMetrics; + thermalMetrics?: AmdGpuThermalMetrics; + powerMetrics?: AmdGpuPowerMetrics; + clockMetrics?: AmdGpuClockMetrics; + pcieMetrics?: AmdGpuPcieMetrics; + xgmiMetrics?: AmdGpuXgmiMetrics; + mediaMetrics?: AmdGpuMediaMetrics; + eccMetrics?: AmdGpuEccMetrics; +} + +/** + * Get advanced AMD GPU metrics from amd-smi + */ +export async function getAdvancedAmdGpuMetrics(gpuIndex: number): Promise { + const metrics: AdvancedAmdGpuMetrics = {}; + + try { + const { stdout } = await execAsync(`amd-smi metric -g ${gpuIndex} --json 2>/dev/null`); + const data = JSON.parse(stdout); + const gpuData = data.gpu_data?.[0]; + + if (!gpuData) return metrics; + + // Engine utilization + if (gpuData.engines) { + metrics.engineMetrics = { + gfx: parseFloat(gpuData.engines.gfx?.value) || 0, + mem: parseFloat(gpuData.engines.mem?.value) || 0, + mm: parseFloat(gpuData.engines.mm?.value) || 0, + }; + } + + // Thermal metrics + if (gpuData.temperature) { + metrics.thermalMetrics = { + edge: gpuData.temperature.edge?.value ? parseFloat(gpuData.temperature.edge.value) : null, + junction: gpuData.temperature.hotspot ? parseFloat(gpuData.temperature.hotspot) : null, + memory: gpuData.temperature.mem ? parseFloat(gpuData.temperature.mem) : null, + isThrottling: gpuData.throttle?.status === 'yes', + throttleReason: gpuData.throttle?.reason || undefined, + }; + } + + // Power metrics + if (gpuData.power) { + metrics.powerMetrics = { + instant: parseFloat(gpuData.power.socket_power) || null, + average: parseFloat(gpuData.power.average_socket_power) || null, + voltage: parseFloat(gpuData.power.voltage) || null, + }; + } + + // Clock metrics + if (gpuData.clock) { + metrics.clockMetrics = { + sclk: parseInt(gpuData.clock.sclk?.value) || null, + mclk: parseInt(gpuData.clock.mclk?.value) || null, + }; + } + + // PCIe metrics + if (gpuData.pcie) { + metrics.pcieMetrics = { + width: parseInt(gpuData.pcie.width) || null, + speed: gpuData.pcie.speed || null, + bandwidth: parseFloat(gpuData.pcie.bandwidth) || null, + replayErrors: parseInt(gpuData.pcie.replay_errors) || null, + }; + } + + // XGMI metrics (for multi-GPU) + if (gpuData.xgmi) { + metrics.xgmiMetrics = { + bandwidth: parseFloat(gpuData.xgmi.bandwidth) || null, + linkStatus: gpuData.xgmi.link_status || null, + }; + } + + // Media engines + if (gpuData.engines) { + metrics.mediaMetrics = { + encoder: parseFloat(gpuData.engines.encoder?.value) || null, + decoder: parseFloat(gpuData.engines.decoder?.value) || null, + }; + } + + // ECC metrics + if (gpuData.ecc) { + metrics.eccMetrics = { + correctable: parseInt(gpuData.ecc.correctable_count) || 0, + uncorrectable: parseInt(gpuData.ecc.uncorrectable_count) || 0, + }; + } + + } catch (error) { + console.warn(`Advanced AMD metrics not available for GPU ${gpuIndex}:`, error); + } + + return metrics; +} + +/** + * Check if AMD SMI is available for advanced metrics + */ +export async function isAmdSmiAvailable(): Promise { + try { + await execAsync('which amd-smi'); + return true; + } catch { + return false; + } +} + diff --git a/types/metrics.ts b/types/metrics.ts index a42ebb4..eaab58b 100644 --- a/types/metrics.ts +++ b/types/metrics.ts @@ -41,6 +41,61 @@ export interface MemoryMetrics { swapFree: number; } +// AMD GPU Engine Utilization (GFX, MEM, MM) +export interface GpuEngineUtilization { + gfx: number; // Compute/graphics cores + mem: number; // Memory controller + mm: number; // Multimedia engines +} + +// AMD GPU Thermal sensors +export interface GpuThermal { + edge: number | null; + junction: number | null; + memory: number | null; + isThrottling: boolean; + throttleReason?: string; +} + +// AMD GPU Power metrics +export interface GpuPower { + instant: number | null; + average: number | null; + voltage: number | null; +} + +// AMD GPU Clock speeds +export interface GpuClocks { + sclk: number | null; // Core clock + mclk: number | null; // Memory clock +} + +// AMD PCIe metrics +export interface GpuPcie { + width: number | null; + speed: string | null; + bandwidth: number | null; // MB/s + replayErrors: number | null; +} + +// AMD XGMI metrics (multi-GPU interconnect) +export interface GpuXgmi { + bandwidth: number | null; // MB/s + linkStatus: string | null; +} + +// AMD Media Engine metrics +export interface GpuMedia { + encoder: number | null; // % utilization + decoder: number | null; // % utilization +} + +// AMD ECC metrics +export interface GpuEcc { + correctable: number; + uncorrectable: number; +} + export interface GpuMetrics { index: number; name: string; @@ -50,6 +105,7 @@ export interface GpuMetrics { memory: { total: number; used: number; + utilization?: number | null; }; gttMemory?: { total: number; @@ -75,6 +131,16 @@ export interface GpuMetrics { eccCorrectable?: number | null; eccUncorrectable?: number | null; isThrottling?: boolean; + + // New advanced AMD metrics + engineUtilization?: GpuEngineUtilization; + thermal?: GpuThermal; + powerMetrics?: GpuPower; + clocks?: GpuClocks; + pcieMetrics?: GpuPcie; + xgmiMetrics?: GpuXgmi; + mediaEngines?: GpuMedia; + eccMetrics?: GpuEcc; } export interface NetworkMetrics { @@ -125,4 +191,5 @@ export interface SystemMetrics { }; rocmDetected: boolean; rocmRuntimeVersion: string; + amdSmiAvailable?: boolean; }