Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/*
tools/__pycache__/*
219 changes: 174 additions & 45 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
import re
import json
import time
import subprocess
Expand All @@ -14,9 +15,15 @@
"vram_total": 0,
"vram_used_percent": 0,
"gpu_temperature": 0,
"gtt_used": 0,
"gtt_total": 0,
"gtt_used_percent": 0,
"last_update": 0
}

# Previous (total, idle) counters taken from the aggregate "cpu" line of
# /proc/stat. Stays None until the first sample is recorded; the psutil-less
# fallback derives CPU utilization from the delta between two samples.
cpu_prev = None

# Background monitor state.
# NOTE(review): how the event is interpreted (set == stop vs. set == run) is
# not visible in this part of the file -- confirm against the thread loop.
thread_control = threading.Event()
monitor_thread = None  # no monitor thread exists yet at import time
Expand Down Expand Up @@ -81,61 +88,163 @@ def get_gpu_info(rocm_smi_path):
"""Get current GPU information"""
global gpu_stats

# Get GPU utilization
try:
info = run_rocm_smi_command(rocm_smi_path, '--showuse', '--json')
if isinstance(info, dict) and 'card0' in info:
card_info = info['card0'] # Use first GPU
if 'GPU use (%)' in card_info:
gpu_use = card_info['GPU use (%)']
if isinstance(gpu_use, str):
gpu_use = gpu_use.replace('%', '')
gpu_stats["gpu_utilization"] = int(float(gpu_use))
except:
pass

# Get VRAM information
# Query rocm-smi once for multiple JSON fields to reduce subprocess overhead
try:
info = run_rocm_smi_command(rocm_smi_path, '--showmeminfo', 'vram', '--json')
# Request use, vram, and gtt meminfo plus temps in one JSON call
info = run_rocm_smi_command(rocm_smi_path, '--showuse', '--showmeminfo', 'vram', 'gtt', '--showtemp', '--json')
if isinstance(info, dict) and 'card0' in info:
card_info = info['card0'] # Use first GPU

# Parse the B (bytes) format ROCm 5.x/6.x uses
if 'VRAM Total Memory (B)' in card_info and 'VRAM Total Used Memory (B)' in card_info:
vram_total_bytes = int(card_info['VRAM Total Memory (B)'])
vram_used_bytes = int(card_info['VRAM Total Used Memory (B)'])

# Convert to MB for display
vram_total = vram_total_bytes / (1024 * 1024)
vram_used = vram_used_bytes / (1024 * 1024)

gpu_stats["vram_total"] = int(vram_total)
gpu_stats["vram_used"] = int(vram_used)
gpu_stats["vram_used_percent"] = int((vram_used / vram_total) * 100)
card_info = info['card0']
# GPU utilization
try:
if 'GPU use (%)' in card_info:
gpu_use = card_info['GPU use (%)']
if isinstance(gpu_use, str):
gpu_use = gpu_use.replace('%', '')
gpu_stats["gpu_utilization"] = int(float(gpu_use))
except:
pass
# VRAM (bytes keys used by ROCm)
try:
if 'VRAM Total Memory (B)' in card_info and 'VRAM Total Used Memory (B)' in card_info:
vram_total_bytes = int(card_info['VRAM Total Memory (B)'])
vram_used_bytes = int(card_info['VRAM Total Used Memory (B)'])
vram_total = vram_total_bytes / (1024 * 1024)
vram_used = vram_used_bytes / (1024 * 1024)
gpu_stats["vram_total"] = int(vram_total)
gpu_stats["vram_used"] = int(vram_used)
gpu_stats["vram_used_percent"] = int((vram_used / vram_total) * 100) if vram_total > 0 else 0
except:
pass
# Temperature
try:
if 'Temperature (Sensor edge) (C)' in card_info:
temp_str = card_info['Temperature (Sensor edge) (C)']
if isinstance(temp_str, str):
temp_str = temp_str.replace('°C', '').strip()
gpu_stats["gpu_temperature"] = int(float(temp_str))
elif 'Temperature (Sensor junction) (C)' in card_info:
temp_str = card_info['Temperature (Sensor junction) (C)']
if isinstance(temp_str, str):
temp_str = temp_str.replace('°C', '').strip()
gpu_stats["gpu_temperature"] = int(float(temp_str))
except:
pass
except:
pass

# Get temperature
# GTT information: try to read from the same JSON response first
gtt_parsed = False
try:
info = run_rocm_smi_command(rocm_smi_path, '--showtemp', '--json')
if isinstance(info, dict) and 'card0' in info:
card_info = info['card0'] # Use first GPU

# Try different temperature sensors, starting with edge
if 'Temperature (Sensor edge) (C)' in card_info:
temp_str = card_info['Temperature (Sensor edge) (C)']
if isinstance(temp_str, str):
temp_str = temp_str.replace('°C', '').strip()
gpu_stats["gpu_temperature"] = int(float(temp_str))
elif 'Temperature (Sensor junction) (C)' in card_info:
temp_str = card_info['Temperature (Sensor junction) (C)']
if isinstance(temp_str, str):
temp_str = temp_str.replace('°C', '').strip()
gpu_stats["gpu_temperature"] = int(float(temp_str))
card_info = info['card0']
if 'GTT Total Memory (B)' in card_info and 'GTT Total Used Memory (B)' in card_info:
gtt_total_bytes = int(card_info['GTT Total Memory (B)'])
gtt_used_bytes = int(card_info['GTT Total Used Memory (B)'])
gtt_total = gtt_total_bytes / (1024 * 1024)
gtt_used = gtt_used_bytes / (1024 * 1024)
gpu_stats['gtt_total'] = int(gtt_total)
gpu_stats['gtt_used'] = int(gtt_used)
gpu_stats['gtt_used_percent'] = int((gtt_used / gtt_total) * 100) if gtt_total > 0 else 0
gtt_parsed = True
except:
pass
if not gtt_parsed:
try:
info_text = run_rocm_smi_command(rocm_smi_path, '--showmeminfo', 'gtt')
if isinstance(info_text, str) and info_text:
m_total = re.search(r"GTT.*Total.*?(\d+[\.,]?\d*)\s*(MiB|GiB|MB|GB|B)?", info_text, re.I)
m_used = re.search(r"GTT.*Used.*?(\d+[\.,]?\d*)\s*(MiB|GiB|MB|GB|B)?", info_text, re.I)
if m_total:
total_val = float(m_total.group(1).replace(',', '.'))
unit = (m_total.group(2) or 'MB').lower()
if 'g' in unit:
total_mb = total_val * 1024
elif 'b' == unit:
total_mb = total_val / (1024*1024)
else:
total_mb = total_val
gpu_stats['gtt_total'] = int(total_mb)
if m_used:
used_val = float(m_used.group(1).replace(',', '.'))
unit = (m_used.group(2) or 'MB').lower()
if 'g' in unit:
used_mb = used_val * 1024
elif 'b' == unit:
used_mb = used_val / (1024*1024)
else:
used_mb = used_val
gpu_stats['gtt_used'] = int(used_mb)
if gpu_stats.get('gtt_total', 0) > 0:
gpu_stats['gtt_used_percent'] = int((gpu_stats['gtt_used'] / gpu_stats['gtt_total']) * 100)
except:
pass



gpu_stats["last_update"] = time.time()
# Update CPU and system RAM info
try:
# Prefer psutil if available
import psutil
try:
cpu_pct = psutil.cpu_percent(interval=None)
gpu_stats['cpu_utilization'] = int(round(cpu_pct))
except Exception:
pass

try:
vm = psutil.virtual_memory()
total_mb = int(vm.total / (1024 * 1024))
used_mb = int((vm.total - getattr(vm, 'available', vm.total - getattr(vm, 'used', 0))) / (1024 * 1024))
gpu_stats['system_ram_total'] = total_mb
gpu_stats['system_ram_used'] = used_mb
gpu_stats['system_ram_used_percent'] = int((used_mb / total_mb) * 100) if total_mb > 0 else 0
except Exception:
pass
except Exception:
# Fallbacks if psutil not available
# CPU percent via /proc/stat
global cpu_prev
try:
with open('/proc/stat', 'r') as f:
line = f.readline()
parts = line.split()
if parts[0] == 'cpu':
vals = list(map(int, parts[1:]))
idle = vals[3]
total = sum(vals)
if cpu_prev is not None:
prev_total, prev_idle = cpu_prev
total_diff = total - prev_total
idle_diff = idle - prev_idle
if total_diff > 0:
cpu_pct = (1.0 - (idle_diff / total_diff)) * 100.0
gpu_stats['cpu_utilization'] = int(round(cpu_pct))
cpu_prev = (total, idle)
except Exception:
pass

# System RAM total/available from /proc/meminfo (kB -> MB)
try:
meminfo = {}
with open('/proc/meminfo', 'r') as f:
for l in f:
parts = l.split()
if len(parts) >= 2:
key = parts[0].rstrip(':')
meminfo[key] = int(parts[1])
if 'MemTotal' in meminfo:
total_mb = int(meminfo['MemTotal'] / 1024)
avail_kb = meminfo.get('MemAvailable', None)
if avail_kb is None:
# Fallback: estimate available from MemFree + Buffers + Cached
avail_kb = meminfo.get('MemFree', 0) + meminfo.get('Buffers', 0) + meminfo.get('Cached', 0)
used_mb = int((meminfo['MemTotal'] - avail_kb) / 1024)
gpu_stats['system_ram_total'] = total_mb
gpu_stats['system_ram_used'] = used_mb
gpu_stats['system_ram_used_percent'] = int((used_mb / total_mb) * 100) if total_mb > 0 else 0
except Exception:
pass
return gpu_stats

def send_monitor_update():
Expand All @@ -147,9 +256,29 @@ def send_monitor_update():
'gpu_temperature': gpu_stats['gpu_temperature'],
'vram_total': gpu_stats['vram_total'],
'vram_used': gpu_stats['vram_used'],
'vram_used_percent': gpu_stats['vram_used_percent']
'vram_used_percent': gpu_stats['vram_used_percent'],
'gtt_total': gpu_stats.get('gtt_total', 0),
'gtt_used': gpu_stats.get('gtt_used', 0),
'gtt_used_percent': gpu_stats.get('gtt_used_percent', 0)
}]
}
# Include CPU and system RAM total (MB) at top level
try:
data['cpu_utilization'] = int(gpu_stats.get('cpu_utilization', 0))
except Exception:
data['cpu_utilization'] = 0
try:
data['system_ram_total'] = int(gpu_stats.get('system_ram_total', 0))
except Exception:
data['system_ram_total'] = 0
try:
data['system_ram_used'] = int(gpu_stats.get('system_ram_used', 0))
except Exception:
data['system_ram_used'] = 0
try:
data['system_ram_used_percent'] = int(gpu_stats.get('system_ram_used_percent', 0))
except Exception:
data['system_ram_used_percent'] = 0

# Send the data
try:
Expand Down
Loading