From 9468483fa65a9981f9fede202bb4e7b282db51f6 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 13 May 2026 09:02:39 -0500 Subject: [PATCH 1/7] feat: make rocenv tool lite/full mode configurable via additional_context Allow users to collect full system environment details (including hardware_information, bios_settings, dmesg logs, amdgpu modinfo) by setting rocenv_mode to "full" in additional_context. Defaults to "lite" for backward compatibility. Usage: --additional-context "{'rocenv_mode': 'full'}" Co-Authored-By: Claude Opus 4 (1M context) --- src/madengine/deployment/k8s_scripts.py | 11 +++-- .../deployment/k8s_template_context.py | 3 +- src/madengine/execution/container_runner.py | 7 +++- .../common/pre_scripts/run_rocenv_tool.sh | 10 ++++- tests/unit/test_container_runner.py | 41 +++++++++++++++++++ tests/unit/test_k8s.py | 36 ++++++++++++++++ 6 files changed, 101 insertions(+), 7 deletions(-) diff --git a/src/madengine/deployment/k8s_scripts.py b/src/madengine/deployment/k8s_scripts.py index 2583c822..68e35c75 100644 --- a/src/madengine/deployment/k8s_scripts.py +++ b/src/madengine/deployment/k8s_scripts.py @@ -26,7 +26,7 @@ class KubernetesScriptsMixin: """Script and tool loading for Kubernetes ConfigMap embedding.""" def gather_system_env_details( - self, pre_scripts: List[Dict], model_name: str + self, pre_scripts: List[Dict], model_name: str, rocenv_mode: str = "lite" ) -> None: """ Gather system environment details by adding rocEnvTool to pre-scripts. 
@@ -36,13 +36,18 @@ def gather_system_env_details( Args: pre_scripts: List of pre-script configurations model_name: The model name (used for output file naming) + rocenv_mode: Collection mode - "lite" (default) or "full" """ + if rocenv_mode not in ("lite", "full"): + self.console.print(f"[yellow]Warning: Unknown rocenv_mode '{rocenv_mode}', defaulting to 'lite'[/yellow]") + rocenv_mode = "lite" + output_name = model_name.replace("/", "_") + "_env" pre_env_details = { "path": "scripts/common/pre_scripts/run_rocenv_tool.sh", - "args": model_name.replace("/", "_") + "_env" + "args": f"{output_name} {rocenv_mode}" } pre_scripts.append(pre_env_details) - self.console.print(f"[dim]Added rocEnvTool to pre-scripts with args: {pre_env_details['args']}[/dim]") + self.console.print(f"[dim]Added rocEnvTool (mode={rocenv_mode}) to pre-scripts with args: {pre_env_details['args']}[/dim]") def _add_tool_scripts(self, pre_scripts: List[Dict], post_scripts: List[Dict]) -> None: """ diff --git a/src/madengine/deployment/k8s_template_context.py b/src/madengine/deployment/k8s_template_context.py index 78c01072..4364af67 100644 --- a/src/madengine/deployment/k8s_template_context.py +++ b/src/madengine/deployment/k8s_template_context.py @@ -418,7 +418,8 @@ def _prepare_template_context( # This is controlled by generate_sys_env_details flag (default: True) generate_sys_env_details = self.config.additional_context.get("generate_sys_env_details", True) if generate_sys_env_details: - self.gather_system_env_details(pre_scripts, model_info["name"]) + rocenv_mode = self.config.additional_context.get("rocenv_mode", "lite") + self.gather_system_env_details(pre_scripts, model_info["name"], rocenv_mode=rocenv_mode) # Add tool pre/post scripts to the execution lists (like local execution) self._add_tool_scripts(pre_scripts, post_scripts) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 2ffc8a31..8bdbf449 100644 --- 
a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -809,7 +809,12 @@ def gather_system_env_details( # initialize pre_env_details pre_env_details = {} pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh" - pre_env_details["args"] = model_name.replace("/", "_") + "_env" + output_name = model_name.replace("/", "_") + "_env" + rocenv_mode = self.context.ctx.get("rocenv_mode", "lite") + if rocenv_mode not in ("lite", "full"): + print(f"Warning: Unknown rocenv_mode '{rocenv_mode}', defaulting to 'lite'") + rocenv_mode = "lite" + pre_env_details["args"] = f"{output_name} {rocenv_mode}" pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) print(f"pre encap post scripts: {pre_encapsulate_post_scripts}") diff --git a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh index 84879d05..a9c5dc38 100644 --- a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh +++ b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh @@ -5,6 +5,12 @@ # OUTPUT_FILE_NAME=${1:-"sys_config_info"} +ROCENV_MODE=${2:-"lite"} + +LITE_FLAG="--lite" +if [ "$ROCENV_MODE" = "full" ]; then + LITE_FLAG="" +fi # Determine the script's directory SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -13,7 +19,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" if [ -d "$SCRIPT_DIR/rocEnvTool" ]; then # K8s execution: rocEnvTool is already in place cd "$SCRIPT_DIR/rocEnvTool" - python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME + python3 rocenv_tool.py $LITE_FLAG --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME out_dir="."$OUTPUT_FILE_NAME out_csv=$OUTPUT_FILE_NAME".csv" # Copy results back to workspace root @@ -27,7 +33,7 @@ else # Local execution: copy rocEnvTool from relative path cp -r ../scripts/common/pre_scripts/rocEnvTool . 
cd rocEnvTool - python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME + python3 rocenv_tool.py $LITE_FLAG --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME out_dir="."$OUTPUT_FILE_NAME out_csv=$OUTPUT_FILE_NAME".csv" cp -r $out_dir ../../ diff --git a/tests/unit/test_container_runner.py b/tests/unit/test_container_runner.py index 2c10cbe7..70b48aa2 100644 --- a/tests/unit/test_container_runner.py +++ b/tests/unit/test_container_runner.py @@ -231,3 +231,44 @@ def test_setup_failure_appends_to_failed_runs_and_records_to_csv( call_kw = mock_update_perf_csv.call_args[1] assert call_kw.get("perf_csv") == perf_csv_path assert "exception_result" in call_kw + + +class TestGatherSystemEnvDetailsRocenvMode: + """gather_system_env_details passes rocenv_mode to run_rocenv_tool.sh args.""" + + def _make_runner(self, ctx_overrides=None): + ctx = MagicMock() + ctx.ctx = ctx_overrides or {} + return ContainerRunner(context=ctx, console=MagicMock()) + + def test_default_mode_is_lite(self): + """When rocenv_mode is absent, args should end with 'lite'.""" + runner = self._make_runner() + pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} + runner.gather_system_env_details(pep, "my_model") + args = pep["pre_scripts"][0]["args"] + assert args == "my_model_env lite" + + def test_explicit_lite_mode(self): + """When rocenv_mode is 'lite', args should end with 'lite'.""" + runner = self._make_runner({"rocenv_mode": "lite"}) + pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} + runner.gather_system_env_details(pep, "my_model") + args = pep["pre_scripts"][0]["args"] + assert args == "my_model_env lite" + + def test_full_mode(self): + """When rocenv_mode is 'full', args should end with 'full'.""" + runner = self._make_runner({"rocenv_mode": "full"}) + pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} + runner.gather_system_env_details(pep, "org/my_model") + args = pep["pre_scripts"][0]["args"] 
+ assert args == "org_my_model_env full" + + def test_invalid_mode_falls_back_to_lite(self): + """When rocenv_mode is invalid, should fall back to 'lite'.""" + runner = self._make_runner({"rocenv_mode": "invalid"}) + pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} + runner.gather_system_env_details(pep, "my_model") + args = pep["pre_scripts"][0]["args"] + assert args == "my_model_env lite" diff --git a/tests/unit/test_k8s.py b/tests/unit/test_k8s.py index 3391b70c..fba607d9 100644 --- a/tests/unit/test_k8s.py +++ b/tests/unit/test_k8s.py @@ -230,3 +230,39 @@ def test_max_63_chars(self): long_hint = "a" * 200 c = sanitize_k8s_container_name(long_hint) assert len(c) <= 63 + + +class TestGatherSystemEnvDetailsK8sRocenvMode: + """K8s gather_system_env_details passes rocenv_mode to run_rocenv_tool.sh args.""" + + def _make_mixin(self): + from unittest.mock import MagicMock + from madengine.deployment.k8s_scripts import KubernetesScriptsMixin + + mixin = KubernetesScriptsMixin() + mixin.console = MagicMock() + return mixin + + def test_default_mode_is_lite(self): + mixin = self._make_mixin() + pre_scripts = [] + mixin.gather_system_env_details(pre_scripts, "my_model") + assert pre_scripts[0]["args"] == "my_model_env lite" + + def test_full_mode(self): + mixin = self._make_mixin() + pre_scripts = [] + mixin.gather_system_env_details(pre_scripts, "org/my_model", rocenv_mode="full") + assert pre_scripts[0]["args"] == "org_my_model_env full" + + def test_explicit_lite_mode(self): + mixin = self._make_mixin() + pre_scripts = [] + mixin.gather_system_env_details(pre_scripts, "my_model", rocenv_mode="lite") + assert pre_scripts[0]["args"] == "my_model_env lite" + + def test_invalid_mode_falls_back_to_lite(self): + mixin = self._make_mixin() + pre_scripts = [] + mixin.gather_system_env_details(pre_scripts, "my_model", rocenv_mode="bogus") + assert pre_scripts[0]["args"] == "my_model_env lite" From a7acf9899b25b0d321d820e5e921a2a02999882c Mon Sep 17 
00:00:00 2001 From: Stephen Shao Date: Wed, 13 May 2026 09:15:42 -0500 Subject: [PATCH 2/7] test: add dummy_rocenv_full fixture with system diagnostic tools Add a dummy model whose Dockerfile installs lshw, dmidecode, kmod, and util-linux so rocenv_tool.py full mode can collect all 4 extra sections (hardware_information, bios_settings, dmsg_gpu_drm_atom_logs, amdgpu_modinfo) inside the container. Co-Authored-By: Claude Opus 4 (1M context) --- .../docker/dummy_rocenv_full.ubuntu.amd.Dockerfile | 12 ++++++++++++ tests/fixtures/dummy/models.json | 13 +++++++++++++ .../fixtures/dummy/scripts/dummy_rocenv_full/run.sh | 7 +++++++ 3 files changed, 32 insertions(+) create mode 100644 tests/fixtures/dummy/docker/dummy_rocenv_full.ubuntu.amd.Dockerfile create mode 100644 tests/fixtures/dummy/scripts/dummy_rocenv_full/run.sh diff --git a/tests/fixtures/dummy/docker/dummy_rocenv_full.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_rocenv_full.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..019606dd --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_rocenv_full.ubuntu.amd.Dockerfile @@ -0,0 +1,12 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER + +# Install system diagnostic tools required by rocenv_tool.py full mode: +# lshw -> hardware_information +# dmidecode -> bios_settings +# kmod -> amdgpu_modinfo (provides modinfo) +# util-linux -> dmsg_gpu_drm_atom_logs (provides dmesg) +RUN apt-get update && \ + apt-get install -y --no-install-recommends lshw dmidecode kmod util-linux && \ + rm -rf /var/lib/apt/lists/* diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index 140779ab..8c8a29c1 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -439,6 +439,19 @@ ], "args": "" }, + { + "name": "dummy_rocenv_full", + "dockerfile": "docker/dummy_rocenv_full", + "scripts": "scripts/dummy_rocenv_full/run.sh", + "n_gpus": "1", + "owner": 
"mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_rocenv_full" + ], + "args": "" + }, { "name": "dummy_therock", "dockerfile": "docker/dummy_therock", diff --git a/tests/fixtures/dummy/scripts/dummy_rocenv_full/run.sh b/tests/fixtures/dummy/scripts/dummy_rocenv_full/run.sh new file mode 100644 index 00000000..2c9893f7 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_rocenv_full/run.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# +# Copyright (c) Advanced Micro Devices, Inc. +# All rights reserved. +# + +echo "performance: $RANDOM samples_per_second" From bb1f840293e037fefe3e6752df3e0aa258c5381e Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 13 May 2026 09:20:06 -0500 Subject: [PATCH 3/7] feat(rocenv): add CSV parser handlers for full-mode sections Add dump handlers for the 4 sections only collected in full mode: - hardware_information (lshw) - bios_settings (dmidecode) - dmsg_gpu_drm_atom_logs (dmesg) - amdgpu_modinfo (modinfo) Previously these sections were collected into .txt files but silently skipped by the CSV parser because it had no matching handlers. 
Co-Authored-By: Claude Opus 4 (1M context) --- .../pre_scripts/rocEnvTool/csv_parser.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py index 7c1599ab..a94cc095 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py @@ -21,6 +21,10 @@ rocm_env_variables pip_list numa_balancing +hardware_information (full mode only - lshw) +bios_settings (full mode only - dmidecode) +dmsg_gpu_drm_atom_logs (full mode only - dmesg) +amdgpu_modinfo (full mode only - modinfo) ''' class CSVParser: def __init__(self, filename, sys_config_files_path, tags, path_resolver=None): @@ -217,6 +221,71 @@ def dump_numa_balancing_in_csv(self, log_path): info_list.append("Numa Balacing|" + lines[1].rstrip()) return info_list + def dump_hardware_information_in_csv(self, log_path): + """Parse lshw output, extracting key hardware fields.""" + lines = self.get_log_file_data(log_path) + info_list = [] + info_list.append(lines[0].rstrip()) + keywords = ("product:", "vendor:", "serial:", "width:", "size:", + "description:", "capabilities:", "clock:") + for j in range(1, len(lines)): + line = lines[j].rstrip() + if not line.strip(): + continue + line_lower = line.strip().lower() + for kw in keywords: + if line_lower.startswith(kw): + key, _, value = line.strip().partition(":") + info_list.append(key.strip() + "|" + value.strip()) + break + return info_list + + def dump_bios_settings_in_csv(self, log_path): + """Parse dmidecode output, extracting key:value pairs.""" + lines = self.get_log_file_data(log_path) + info_list = [] + info_list.append(lines[0].rstrip()) + for j in range(1, len(lines)): + line = lines[j].rstrip() + if not line.strip() or line.startswith("Handle ") or line.startswith("#"): + continue + if ":" in line and line[0] in (" ", "\t"): + key, 
_, value = line.strip().partition(":") + value = value.strip() + if value: + info_list.append(key.strip() + "|" + value) + return info_list + + def dump_dmsg_gpu_drm_atom_logs_in_csv(self, log_path): + """Parse dmesg filtered log lines.""" + lines = self.get_log_file_data(log_path) + info_list = [] + info_list.append(lines[0].rstrip()) + count = 0 + for j in range(1, len(lines)): + line = lines[j].rstrip() + if not line.strip(): + continue + info_list.append("Log|" + line.strip()) + count += 1 + if count >= 50: + break + return info_list + + def dump_amdgpu_modinfo_in_csv(self, log_path): + """Parse modinfo output (key:value per line, like lscpu).""" + lines = self.get_log_file_data(log_path) + info_list = [] + info_list.append(lines[0].rstrip()) + for j in range(1, len(lines)): + line = lines[j].rstrip() + if not line.strip(): + continue + if ":" in line: + key, _, value = line.partition(":") + info_list.append(key.strip() + "|" + value.strip()) + return info_list + def dump_cuda_information_in_csv(self, log_path): lines = self.get_log_file_data(log_path) info_list = [] @@ -291,6 +360,14 @@ def dump_csv_output(self): sys_config_info.extend(self.dump_pip_list_in_csv(log_path)) if tag == "numa_balancing": sys_config_info.extend(self.dump_numa_balancing_in_csv(log_path)) + if tag == "hardware_information": + sys_config_info.extend(self.dump_hardware_information_in_csv(log_path)) + if tag == "bios_settings": + sys_config_info.extend(self.dump_bios_settings_in_csv(log_path)) + if tag == "dmsg_gpu_drm_atom_logs": + sys_config_info.extend(self.dump_dmsg_gpu_drm_atom_logs_in_csv(log_path)) + if tag == "amdgpu_modinfo": + sys_config_info.extend(self.dump_amdgpu_modinfo_in_csv(log_path)) self.sys_config_info_list = sys_config_info From 6404540cfa39ced1ae49420af737bf5fb2666e6c Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 13 May 2026 09:32:29 -0500 Subject: [PATCH 4/7] feat(rocenv): install diagnostic tools on-demand in full mode MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When rocenv_mode is 'full', detect missing tools (lshw, dmidecode, kmod, dmesg) and attempt apt-get install before running rocenv_tool.py. Only installs what's actually missing, degrades gracefully if network or permissions are unavailable. This eliminates the need for a special Dockerfile to use full mode — any model container can collect all system diagnostic sections. Co-Authored-By: Claude Opus 4 (1M context) --- .../scripts/common/pre_scripts/run_rocenv_tool.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh index a9c5dc38..af5c456c 100644 --- a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh +++ b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh @@ -10,6 +10,20 @@ ROCENV_MODE=${2:-"lite"} LITE_FLAG="--lite" if [ "$ROCENV_MODE" = "full" ]; then LITE_FLAG="" + # Install diagnostic tools on-demand if missing (best-effort) + # These are needed for hardware_information, bios_settings, + # dmsg_gpu_drm_atom_logs, and amdgpu_modinfo sections + MISSING_PKGS="" + command -v lshw >/dev/null 2>&1 || MISSING_PKGS="$MISSING_PKGS lshw" + command -v dmidecode >/dev/null 2>&1 || MISSING_PKGS="$MISSING_PKGS dmidecode" + command -v modinfo >/dev/null 2>&1 || MISSING_PKGS="$MISSING_PKGS kmod" + command -v dmesg >/dev/null 2>&1 || MISSING_PKGS="$MISSING_PKGS util-linux" + if [ -n "$MISSING_PKGS" ]; then + echo "rocenv full mode: installing missing diagnostic tools:$MISSING_PKGS" + apt-get update -qq >/dev/null 2>&1 && \ + apt-get install -y -qq --no-install-recommends $MISSING_PKGS >/dev/null 2>&1 || \ + echo "Warning: could not install some diagnostic tools (network or permissions issue)" + fi fi # Determine the script's directory From a0ef4578734666288e1eaedfd0c04a91605b4f99 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Thu, 14 May 2026 22:23:22 -0500 
Subject: [PATCH 5/7] fix(rocenv): install full-mode diagnostics by guest_os (Ubuntu vs CentOS) Wire MAD_GUEST_OS and a third run_rocenv_tool.sh argument from additional_context so full mode uses apt on UBUNTU and microdnf/dnf/yum on CENTOS instead of assuming Debian-based images everywhere. K8s pre-scripts pass the same guest_os. Co-authored-by: Cursor --- src/madengine/deployment/k8s_scripts.py | 13 ++++- .../deployment/k8s_template_context.py | 11 +++- src/madengine/execution/container_runner.py | 21 ++++++- .../common/pre_scripts/run_rocenv_tool.sh | 56 ++++++++++++++++--- tests/unit/test_container_runner.py | 33 ++++++++--- tests/unit/test_k8s.py | 16 ++++-- 6 files changed, 125 insertions(+), 25 deletions(-) diff --git a/src/madengine/deployment/k8s_scripts.py b/src/madengine/deployment/k8s_scripts.py index 68e35c75..277661e0 100644 --- a/src/madengine/deployment/k8s_scripts.py +++ b/src/madengine/deployment/k8s_scripts.py @@ -12,8 +12,9 @@ import json import re from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional +from madengine.core.additional_context_defaults import DEFAULT_GUEST_OS from madengine.utils.path_utils import get_madengine_root from .primus_backend import ( @@ -26,7 +27,11 @@ class KubernetesScriptsMixin: """Script and tool loading for Kubernetes ConfigMap embedding.""" def gather_system_env_details( - self, pre_scripts: List[Dict], model_name: str, rocenv_mode: str = "lite" + self, + pre_scripts: List[Dict], + model_name: str, + rocenv_mode: str = "lite", + guest_os: Optional[str] = None, ) -> None: """ Gather system environment details by adding rocEnvTool to pre-scripts. 
@@ -37,14 +42,16 @@ def gather_system_env_details( pre_scripts: List of pre-script configurations model_name: The model name (used for output file naming) rocenv_mode: Collection mode - "lite" (default) or "full" + guest_os: UBUNTU / CENTOS (madengine additional_context); defaults to UBUNTU """ if rocenv_mode not in ("lite", "full"): self.console.print(f"[yellow]Warning: Unknown rocenv_mode '{rocenv_mode}', defaulting to 'lite'[/yellow]") rocenv_mode = "lite" + go = (guest_os or DEFAULT_GUEST_OS).strip().upper() or DEFAULT_GUEST_OS output_name = model_name.replace("/", "_") + "_env" pre_env_details = { "path": "scripts/common/pre_scripts/run_rocenv_tool.sh", - "args": f"{output_name} {rocenv_mode}" + "args": f"{output_name} {rocenv_mode} {go}", } pre_scripts.append(pre_env_details) self.console.print(f"[dim]Added rocEnvTool (mode={rocenv_mode}) to pre-scripts with args: {pre_env_details['args']}[/dim]") diff --git a/src/madengine/deployment/k8s_template_context.py b/src/madengine/deployment/k8s_template_context.py index 4364af67..bdb43d0e 100644 --- a/src/madengine/deployment/k8s_template_context.py +++ b/src/madengine/deployment/k8s_template_context.py @@ -28,6 +28,7 @@ infer_primus_backend_from_model_name, merged_primus_config, ) +from madengine.core.additional_context_defaults import DEFAULT_GUEST_OS from madengine.core.dataprovider import Data from madengine.core.errors import ConfigurationError from madengine.utils.gpu_config import resolve_runtime_gpus @@ -419,7 +420,15 @@ def _prepare_template_context( generate_sys_env_details = self.config.additional_context.get("generate_sys_env_details", True) if generate_sys_env_details: rocenv_mode = self.config.additional_context.get("rocenv_mode", "lite") - self.gather_system_env_details(pre_scripts, model_info["name"], rocenv_mode=rocenv_mode) + guest_os = str( + self.config.additional_context.get("guest_os") or DEFAULT_GUEST_OS + ).strip().upper() or DEFAULT_GUEST_OS + self.gather_system_env_details( + pre_scripts, 
+ model_info["name"], + rocenv_mode=rocenv_mode, + guest_os=guest_os, + ) # Add tool pre/post scripts to the execution lists (like local execution) self._add_tool_scripts(pre_scripts, post_scripts) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 8bdbf449..991e14bb 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -34,6 +34,7 @@ from madengine.utils.config_parser import ConfigParser from madengine.utils.path_utils import scripts_base_dir_from from madengine.utils.run_details import get_build_number, get_pipeline +from madengine.core.additional_context_defaults import DEFAULT_GUEST_OS from madengine.utils.therock_markers import is_therock_tree from madengine.deployment.base import PERFORMANCE_LOG_PATTERN from madengine.execution.container_runner_helpers import ( @@ -793,6 +794,12 @@ def gather_system_env_details( ) -> None: """Gather system environment details. + Appends ``run_rocenv_tool.sh`` to pre_scripts with args: + ``<output_name> <rocenv_mode> <guest_os>`` (e.g. ``my_model_env lite UBUNTU``). + ``guest_os`` comes from ``docker_env_vars.MAD_GUEST_OS`` (if set) else + ``context.ctx['guest_os']``, defaulting to ``UBUNTU`` — aligned with + ``MAD_GUEST_OS`` injected for the container before this runs. + Args: pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. model_name: The model name. 
@@ -814,7 +821,11 @@ def gather_system_env_details( if rocenv_mode not in ("lite", "full"): print(f"Warning: Unknown rocenv_mode '{rocenv_mode}', defaulting to 'lite'") rocenv_mode = "lite" - pre_env_details["args"] = f"{output_name} {rocenv_mode}" + dv = self.context.ctx.get("docker_env_vars") or {} + guest_os = str( + dv.get("MAD_GUEST_OS") or self.context.ctx.get("guest_os", DEFAULT_GUEST_OS) + ).strip().upper() + pre_env_details["args"] = f"{output_name} {rocenv_mode} {guest_os}" pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) print(f"pre encap post scripts: {pre_encapsulate_post_scripts}") @@ -952,7 +963,7 @@ def run_container( # Also check shell environment for SLURM-passed variables if "docker_env_vars" not in self.context.ctx: self.context.ctx["docker_env_vars"] = {} - + # For SLURM jobs, check shell environment and populate additional_context with GPU info # This ensures GPU resolution works correctly if os.environ.get("MAD_DEPLOYMENT_TYPE") == "slurm": @@ -1014,6 +1025,12 @@ def run_container( if merged_count > 0: print(f"ℹ️ Merged {merged_count} environment variables from additional_context") + # rocEnvTool full-mode installs: align container with madengine guest_os (after docker_env_vars merge) + if "MAD_GUEST_OS" not in self.context.ctx["docker_env_vars"]: + self.context.ctx["docker_env_vars"]["MAD_GUEST_OS"] = str( + self.context.ctx.get("guest_os", DEFAULT_GUEST_OS) + ).strip().upper() + if self.context and str(self.context.ctx.get("gpu_vendor", "")).upper().find( "AMD" ) != -1: diff --git a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh index af5c456c..61cf5f2a 100644 --- a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh +++ b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh @@ -6,6 +6,47 @@ OUTPUT_FILE_NAME=${1:-"sys_config_info"} ROCENV_MODE=${2:-"lite"} +# Third arg or MAD_GUEST_OS: must match madengine additional_context guest_os 
(UBUNTU, CENTOS, ...) +_GUEST_RAW=${3:-${MAD_GUEST_OS:-}} +if [ -z "${_GUEST_RAW}" ]; then + ROCENV_GUEST_OS="UBUNTU" +else + ROCENV_GUEST_OS=$(printf '%s' "${_GUEST_RAW}" | tr '[:lower:]' '[:upper:]') +fi + +# Best-effort install for rocenv "full" mode, keyed off madengine guest_os (not /etc/os-release). +rocenv_install_diagnostic_packages() { + local guest="$1" + local pkgs="$2" + + case "${guest}" in + UBUNTU) + if ! command -v apt-get >/dev/null 2>&1; then + echo "Warning: guest_os is UBUNTU but apt-get not found in this image; skipping package install for:${pkgs}" + return 1 + fi + apt-get update -qq >/dev/null 2>&1 && \ + apt-get install -y -qq --no-install-recommends ${pkgs} >/dev/null 2>&1 + ;; + CENTOS) + if command -v microdnf >/dev/null 2>&1; then + microdnf install -y -q ${pkgs} >/dev/null 2>&1 && return 0 + fi + if command -v dnf >/dev/null 2>&1; then + dnf install -y -q ${pkgs} >/dev/null 2>&1 && return 0 + fi + if command -v yum >/dev/null 2>&1; then + yum install -y -q ${pkgs} >/dev/null 2>&1 && return 0 + fi + echo "Warning: guest_os is CENTOS but no microdnf, dnf, or yum found; skipping package install for:${pkgs}" + return 1 + ;; + *) + echo "Warning: rocenv full mode auto-install is not implemented for guest_os=${guest} (supported: UBUNTU, CENTOS). 
Missing tools:${pkgs}" + return 1 + ;; + esac +} LITE_FLAG="--lite" if [ "$ROCENV_MODE" = "full" ]; then @@ -14,15 +55,14 @@ if [ "$ROCENV_MODE" = "full" ]; then # These are needed for hardware_information, bios_settings, # dmsg_gpu_drm_atom_logs, and amdgpu_modinfo sections MISSING_PKGS="" - command -v lshw >/dev/null 2>&1 || MISSING_PKGS="$MISSING_PKGS lshw" - command -v dmidecode >/dev/null 2>&1 || MISSING_PKGS="$MISSING_PKGS dmidecode" - command -v modinfo >/dev/null 2>&1 || MISSING_PKGS="$MISSING_PKGS kmod" - command -v dmesg >/dev/null 2>&1 || MISSING_PKGS="$MISSING_PKGS util-linux" + command -v lshw >/dev/null 2>&1 || MISSING_PKGS="${MISSING_PKGS:+$MISSING_PKGS }lshw" + command -v dmidecode >/dev/null 2>&1 || MISSING_PKGS="${MISSING_PKGS:+$MISSING_PKGS }dmidecode" + command -v modinfo >/dev/null 2>&1 || MISSING_PKGS="${MISSING_PKGS:+$MISSING_PKGS }kmod" + command -v dmesg >/dev/null 2>&1 || MISSING_PKGS="${MISSING_PKGS:+$MISSING_PKGS }util-linux" if [ -n "$MISSING_PKGS" ]; then - echo "rocenv full mode: installing missing diagnostic tools:$MISSING_PKGS" - apt-get update -qq >/dev/null 2>&1 && \ - apt-get install -y -qq --no-install-recommends $MISSING_PKGS >/dev/null 2>&1 || \ - echo "Warning: could not install some diagnostic tools (network or permissions issue)" + echo "rocenv full mode (guest_os=${ROCENV_GUEST_OS}): installing missing diagnostic tools: ${MISSING_PKGS}" + rocenv_install_diagnostic_packages "${ROCENV_GUEST_OS}" "${MISSING_PKGS}" || \ + echo "Warning: could not install some diagnostic tools (network, permissions, or unsupported guest_os)" fi fi diff --git a/tests/unit/test_container_runner.py b/tests/unit/test_container_runner.py index 70b48aa2..7eb7fe88 100644 --- a/tests/unit/test_container_runner.py +++ b/tests/unit/test_container_runner.py @@ -242,28 +242,47 @@ def _make_runner(self, ctx_overrides=None): return ContainerRunner(context=ctx, console=MagicMock()) def test_default_mode_is_lite(self): - """When rocenv_mode is absent, args 
should end with 'lite'.""" + """When rocenv_mode is absent, args should end with 'lite' and default guest_os UBUNTU.""" runner = self._make_runner() pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} runner.gather_system_env_details(pep, "my_model") args = pep["pre_scripts"][0]["args"] - assert args == "my_model_env lite" + assert args == "my_model_env lite UBUNTU" def test_explicit_lite_mode(self): - """When rocenv_mode is 'lite', args should end with 'lite'.""" + """When rocenv_mode is 'lite', args should end with 'lite' and guest_os.""" runner = self._make_runner({"rocenv_mode": "lite"}) pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} runner.gather_system_env_details(pep, "my_model") args = pep["pre_scripts"][0]["args"] - assert args == "my_model_env lite" + assert args == "my_model_env lite UBUNTU" def test_full_mode(self): - """When rocenv_mode is 'full', args should end with 'full'.""" + """When rocenv_mode is 'full', args should end with 'full' and guest_os.""" runner = self._make_runner({"rocenv_mode": "full"}) pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} runner.gather_system_env_details(pep, "org/my_model") args = pep["pre_scripts"][0]["args"] - assert args == "org_my_model_env full" + assert args == "org_my_model_env full UBUNTU" + + def test_guest_os_centos(self): + """guest_os in context is passed as third arg (uppercased).""" + runner = self._make_runner({"rocenv_mode": "lite", "guest_os": "centos"}) + pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} + runner.gather_system_env_details(pep, "my_model") + assert pep["pre_scripts"][0]["args"] == "my_model_env lite CENTOS" + + def test_mad_guest_os_overrides_guest_os(self): + """docker_env_vars MAD_GUEST_OS wins over top-level guest_os for script args.""" + runner = self._make_runner( + { + "guest_os": "UBUNTU", + "docker_env_vars": {"MAD_GUEST_OS": "CENTOS"}, + } + ) + pep = {"pre_scripts": [], 
"encapsulate_script": "", "post_scripts": []} + runner.gather_system_env_details(pep, "my_model") + assert pep["pre_scripts"][0]["args"] == "my_model_env lite CENTOS" def test_invalid_mode_falls_back_to_lite(self): """When rocenv_mode is invalid, should fall back to 'lite'.""" @@ -271,4 +290,4 @@ def test_invalid_mode_falls_back_to_lite(self): pep = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} runner.gather_system_env_details(pep, "my_model") args = pep["pre_scripts"][0]["args"] - assert args == "my_model_env lite" + assert args == "my_model_env lite UBUNTU" diff --git a/tests/unit/test_k8s.py b/tests/unit/test_k8s.py index fba607d9..4a1f84a4 100644 --- a/tests/unit/test_k8s.py +++ b/tests/unit/test_k8s.py @@ -247,22 +247,30 @@ def test_default_mode_is_lite(self): mixin = self._make_mixin() pre_scripts = [] mixin.gather_system_env_details(pre_scripts, "my_model") - assert pre_scripts[0]["args"] == "my_model_env lite" + assert pre_scripts[0]["args"] == "my_model_env lite UBUNTU" def test_full_mode(self): mixin = self._make_mixin() pre_scripts = [] mixin.gather_system_env_details(pre_scripts, "org/my_model", rocenv_mode="full") - assert pre_scripts[0]["args"] == "org_my_model_env full" + assert pre_scripts[0]["args"] == "org_my_model_env full UBUNTU" def test_explicit_lite_mode(self): mixin = self._make_mixin() pre_scripts = [] mixin.gather_system_env_details(pre_scripts, "my_model", rocenv_mode="lite") - assert pre_scripts[0]["args"] == "my_model_env lite" + assert pre_scripts[0]["args"] == "my_model_env lite UBUNTU" + + def test_guest_os_centos(self): + mixin = self._make_mixin() + pre_scripts = [] + mixin.gather_system_env_details( + pre_scripts, "my_model", rocenv_mode="lite", guest_os="centos" + ) + assert pre_scripts[0]["args"] == "my_model_env lite CENTOS" def test_invalid_mode_falls_back_to_lite(self): mixin = self._make_mixin() pre_scripts = [] mixin.gather_system_env_details(pre_scripts, "my_model", rocenv_mode="bogus") - assert 
pre_scripts[0]["args"] == "my_model_env lite" + assert pre_scripts[0]["args"] == "my_model_env lite UBUNTU" From 010acc483cbe7a39da3e8e7f477837a52c458e77 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 15 May 2026 13:37:22 -0500 Subject: [PATCH 6/7] fix(tests): stabilize RPD pre-script and RCCL profiling e2e Use apt-get/yum without sudo when root in trace.sh RPD setup, add build deps for rocmProfileData, and constrain dummy NCCL init (HIP/IB/socket) to avoid topology hangs. Relax rccl_trace log assertion for minor NCCL log format drift. Co-authored-by: Cursor --- .../scripts/common/pre_scripts/trace.sh | 32 ++++++++++++++++--- tests/e2e/test_profiling_workflows.py | 3 +- .../dummy/scripts/dummy/run_nccl_trace.sh | 32 +++++++++++++++---- 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/madengine/scripts/common/pre_scripts/trace.sh b/src/madengine/scripts/common/pre_scripts/trace.sh index 5c591c83..bdbfdbc7 100644 --- a/src/madengine/scripts/common/pre_scripts/trace.sh +++ b/src/madengine/scripts/common/pre_scripts/trace.sh @@ -13,20 +13,42 @@ case "$tool" in rpd) # OS packages only needed for RPD build; other tools (e.g. rocm_trace_lite) skip this. + # Docker madengine runs often use root with no sudo — use apt-get/yum directly when uid==0. 
os='' - if command -v apt >/dev/null 2>&1; then + if command -v apt-get >/dev/null 2>&1; then os=ubuntu elif command -v yum >/dev/null 2>&1; then os=centos else - echo 'Unable to detect Host OS in pre_script (need apt or yum for RPD dependencies)' >&2 + echo 'Unable to detect Host OS in pre_script (need apt-get or yum for RPD dependencies)' >&2 exit 1 fi if [ "$os" == 'ubuntu' ]; then - sudo apt update - sudo apt install -y sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev + if [ "$(id -u)" -eq 0 ]; then + apt-get update -qq + DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev \ + git build-essential pkg-config + elif command -v sudo >/dev/null 2>&1; then + sudo apt-get update -qq + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev \ + git build-essential pkg-config + else + echo 'RPD pre-script: need root or sudo for apt-get' >&2 + exit 1 + fi elif [ "$os" == 'centos' ]; then - sudo yum install -y libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel + if [ "$(id -u)" -eq 0 ]; then + yum install -y gcc gcc-c++ make git \ + libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel + elif command -v sudo >/dev/null 2>&1; then + sudo yum install -y gcc gcc-c++ make git \ + libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel + else + echo 'RPD pre-script: need root or sudo for yum' >&2 + exit 1 + fi else echo "Unable to detect Host OS in trace pre-script" fi diff --git a/tests/e2e/test_profiling_workflows.py b/tests/e2e/test_profiling_workflows.py index 74925ae2..acc0b960 100644 --- a/tests/e2e/test_profiling_workflows.py +++ b/tests/e2e/test_profiling_workflows.py @@ -316,7 +316,8 @@ def test_rccl_trace_runs_correctly(self, global_data, clean_test_temp_files): canFail=False, ) - regexp = re.compile(r"NCCL INFO AllReduce:") + # RCCL log prefix/format varies slightly by RCCL build; keep assertion 
anchored on NCCL + AllReduce. + regexp = re.compile(r"NCCL\s+INFO\s+.*AllReduce", re.IGNORECASE) foundMatch = None with open( os.path.join( diff --git a/tests/fixtures/dummy/scripts/dummy/run_nccl_trace.sh b/tests/fixtures/dummy/scripts/dummy/run_nccl_trace.sh index da2a8798..b55df647 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_nccl_trace.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_nccl_trace.sh @@ -1,9 +1,29 @@ -#!/bin/bash -# +#!/usr/bin/env bash +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# +# Single-process NCCL sanity check for RCCL_TRACE e2e tests. On hosts where the +# container exposes fewer GPUs than the physical topology (e.g. MI300X multi-OAM), +# RCCL can spend a long time probing inaccessible ROCr/KFD nodes without these guards. -python -c "import torch; import torch.distributed as dist; import os; os.environ['MASTER_ADDR'] = 'localhost'; os.environ['MASTER_PORT'] = '29501'; dist.init_process_group('nccl', rank=0, world_size=1);tensor = torch.arange(1, dtype=torch.int64).cuda(); dist.all_reduce(tensor, op=dist.ReduceOp.SUM); print(tensor[0]); " | tee log.txt - -echo "performance: 1 pass" +export HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES:-0}" +export NCCL_IB_DISABLE="${NCCL_IB_DISABLE:-1}" +export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-lo}" +export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" +export MASTER_PORT="${MASTER_PORT:-29501}" + +python3 -c " +import os +import torch +import torch.distributed as dist + +os.environ.setdefault('MASTER_ADDR', '127.0.0.1') +os.environ.setdefault('MASTER_PORT', '29501') +dist.init_process_group('nccl', rank=0, world_size=1) +tensor = torch.arange(1, dtype=torch.int64).cuda() +dist.all_reduce(tensor, op=dist.ReduceOp.SUM) +print(tensor[0]) +" | tee log.txt + +echo "performance: 1 pass" From 22baace73c2acf0ceb70c5c73d6856e0781e1b82 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Sat, 16 May 2026 04:28:43 +0000 Subject: [PATCH 7/7] fix(rpd): install xxd in trace.sh pre-script 
for rpd_tracer build Upstream rocmProfileData/rpd_tracer/Makefile uses `xxd -i` to embed tableSchema.cmd/utilitySchema.cmd as C arrays. The rocm/pytorch base image lacks xxd, so `make rpd` failed with exit 127 and the e2e test saw no trace.rpd. Add xxd on Ubuntu and vim-common (provides xxd) on CentOS. Co-Authored-By: Claude Opus 4 --- src/madengine/scripts/common/pre_scripts/trace.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/madengine/scripts/common/pre_scripts/trace.sh b/src/madengine/scripts/common/pre_scripts/trace.sh index bdbfdbc7..f6b8e624 100644 --- a/src/madengine/scripts/common/pre_scripts/trace.sh +++ b/src/madengine/scripts/common/pre_scripts/trace.sh @@ -28,12 +28,12 @@ rpd) apt-get update -qq DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev \ - git build-essential pkg-config + git build-essential pkg-config xxd elif command -v sudo >/dev/null 2>&1; then sudo apt-get update -qq sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev \ - git build-essential pkg-config + git build-essential pkg-config xxd else echo 'RPD pre-script: need root or sudo for apt-get' >&2 exit 1 @@ -41,10 +41,10 @@ rpd) elif [ "$os" == 'centos' ]; then if [ "$(id -u)" -eq 0 ]; then yum install -y gcc gcc-c++ make git \ - libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel + libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel vim-common elif command -v sudo >/dev/null 2>&1; then sudo yum install -y gcc gcc-c++ make git \ - libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel + libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel vim-common else echo 'RPD pre-script: need root or sudo for yum' >&2 exit 1