Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions src/madengine/deployment/k8s_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
import json
import re
from pathlib import Path
from typing import Dict, List
from typing import Dict, List, Optional

from madengine.core.additional_context_defaults import DEFAULT_GUEST_OS
from madengine.utils.path_utils import get_madengine_root

from .primus_backend import (
Expand All @@ -26,7 +27,11 @@ class KubernetesScriptsMixin:
"""Script and tool loading for Kubernetes ConfigMap embedding."""

def gather_system_env_details(
self, pre_scripts: List[Dict], model_name: str
self,
pre_scripts: List[Dict],
model_name: str,
rocenv_mode: str = "lite",
guest_os: Optional[str] = None,
) -> None:
"""
Gather system environment details by adding rocEnvTool to pre-scripts.
Expand All @@ -36,13 +41,20 @@ def gather_system_env_details(
Args:
pre_scripts: List of pre-script configurations
model_name: The model name (used for output file naming)
rocenv_mode: Collection mode - "lite" (default) or "full"
guest_os: UBUNTU / CENTOS (madengine additional_context); defaults to UBUNTU
"""
if rocenv_mode not in ("lite", "full"):
self.console.print(f"[yellow]Warning: Unknown rocenv_mode '{rocenv_mode}', defaulting to 'lite'[/yellow]")
rocenv_mode = "lite"
go = (guest_os or DEFAULT_GUEST_OS).strip().upper() or DEFAULT_GUEST_OS
output_name = model_name.replace("/", "_") + "_env"
pre_env_details = {
"path": "scripts/common/pre_scripts/run_rocenv_tool.sh",
"args": model_name.replace("/", "_") + "_env"
"args": f"{output_name} {rocenv_mode} {go}",
}
pre_scripts.append(pre_env_details)
self.console.print(f"[dim]Added rocEnvTool to pre-scripts with args: {pre_env_details['args']}[/dim]")
self.console.print(f"[dim]Added rocEnvTool (mode={rocenv_mode}) to pre-scripts with args: {pre_env_details['args']}[/dim]")

def _add_tool_scripts(self, pre_scripts: List[Dict], post_scripts: List[Dict]) -> None:
"""
Expand Down
12 changes: 11 additions & 1 deletion src/madengine/deployment/k8s_template_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
infer_primus_backend_from_model_name,
merged_primus_config,
)
from madengine.core.additional_context_defaults import DEFAULT_GUEST_OS
from madengine.core.dataprovider import Data
from madengine.core.errors import ConfigurationError
from madengine.utils.gpu_config import resolve_runtime_gpus
Expand Down Expand Up @@ -418,7 +419,16 @@ def _prepare_template_context(
# This is controlled by generate_sys_env_details flag (default: True)
generate_sys_env_details = self.config.additional_context.get("generate_sys_env_details", True)
if generate_sys_env_details:
self.gather_system_env_details(pre_scripts, model_info["name"])
rocenv_mode = self.config.additional_context.get("rocenv_mode", "lite")
guest_os = str(
self.config.additional_context.get("guest_os") or DEFAULT_GUEST_OS
).strip().upper() or DEFAULT_GUEST_OS
self.gather_system_env_details(
pre_scripts,
model_info["name"],
rocenv_mode=rocenv_mode,
guest_os=guest_os,
)

# Add tool pre/post scripts to the execution lists (like local execution)
self._add_tool_scripts(pre_scripts, post_scripts)
Expand Down
26 changes: 24 additions & 2 deletions src/madengine/execution/container_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from madengine.utils.config_parser import ConfigParser
from madengine.utils.path_utils import scripts_base_dir_from
from madengine.utils.run_details import get_build_number, get_pipeline
from madengine.core.additional_context_defaults import DEFAULT_GUEST_OS
from madengine.utils.therock_markers import is_therock_tree
from madengine.deployment.base import PERFORMANCE_LOG_PATTERN
from madengine.execution.container_runner_helpers import (
Expand Down Expand Up @@ -793,6 +794,12 @@ def gather_system_env_details(
) -> None:
"""Gather system environment details.

Appends ``run_rocenv_tool.sh`` to pre_scripts with args:
``<output_basename> <rocenv_mode> <guest_os>`` (e.g. ``my_model_env lite UBUNTU``).
``guest_os`` comes from ``docker_env_vars.MAD_GUEST_OS`` (if set) else
``context.ctx['guest_os']``, defaulting to ``UBUNTU`` — aligned with
``MAD_GUEST_OS`` injected for the container before this runs.

Args:
pre_encapsulate_post_scripts: The pre, encapsulate and post scripts.
model_name: The model name.
Expand All @@ -809,7 +816,16 @@ def gather_system_env_details(
# initialize pre_env_details
pre_env_details = {}
pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh"
pre_env_details["args"] = model_name.replace("/", "_") + "_env"
output_name = model_name.replace("/", "_") + "_env"
rocenv_mode = self.context.ctx.get("rocenv_mode", "lite")
if rocenv_mode not in ("lite", "full"):
print(f"Warning: Unknown rocenv_mode '{rocenv_mode}', defaulting to 'lite'")
rocenv_mode = "lite"
dv = self.context.ctx.get("docker_env_vars") or {}
guest_os = str(
dv.get("MAD_GUEST_OS") or self.context.ctx.get("guest_os", DEFAULT_GUEST_OS)
).strip().upper()
pre_env_details["args"] = f"{output_name} {rocenv_mode} {guest_os}"
pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details)
print(f"pre encap post scripts: {pre_encapsulate_post_scripts}")

Expand Down Expand Up @@ -947,7 +963,7 @@ def run_container(
# Also check shell environment for SLURM-passed variables
if "docker_env_vars" not in self.context.ctx:
self.context.ctx["docker_env_vars"] = {}

# For SLURM jobs, check shell environment and populate additional_context with GPU info
# This ensures GPU resolution works correctly
if os.environ.get("MAD_DEPLOYMENT_TYPE") == "slurm":
Expand Down Expand Up @@ -1009,6 +1025,12 @@ def run_container(
if merged_count > 0:
print(f"ℹ️ Merged {merged_count} environment variables from additional_context")

# rocEnvTool full-mode installs: align container with madengine guest_os (after docker_env_vars merge)
if "MAD_GUEST_OS" not in self.context.ctx["docker_env_vars"]:
self.context.ctx["docker_env_vars"]["MAD_GUEST_OS"] = str(
self.context.ctx.get("guest_os", DEFAULT_GUEST_OS)
).strip().upper()

if self.context and str(self.context.ctx.get("gpu_vendor", "")).upper().find(
"AMD"
) != -1:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
rocm_env_variables
pip_list
numa_balancing
hardware_information (full mode only - lshw)
bios_settings (full mode only - dmidecode)
dmsg_gpu_drm_atom_logs (full mode only - dmesg)
amdgpu_modinfo (full mode only - modinfo)
'''
class CSVParser:
def __init__(self, filename, sys_config_files_path, tags, path_resolver=None):
Expand Down Expand Up @@ -217,6 +221,71 @@ def dump_numa_balancing_in_csv(self, log_path):
info_list.append("Numa Balacing|" + lines[1].rstrip())
return info_list

def dump_hardware_information_in_csv(self, log_path):
    """Extract selected ``key: value`` fields from an lshw log section.

    The first line of the log is passed through as the section header with
    trailing whitespace removed. Each following line whose stripped text
    starts (case-insensitively) with one of the tracked field names is
    emitted as ``key|value``, split on the first colon.

    Args:
        log_path: Path handed to ``self.get_log_file_data``.

    Returns:
        A list of strings: the header followed by the ``key|value`` rows,
        or an empty list when the log contains no lines.
    """
    lines = self.get_log_file_data(log_path)
    if not lines:
        # An empty log would otherwise raise IndexError on lines[0];
        # presumably this happens when lshw produced no output.
        return []

    tracked_fields = (
        "product:", "vendor:", "serial:", "width:", "size:",
        "description:", "capabilities:", "clock:",
    )
    info_list = [lines[0].rstrip()]
    for raw in lines[1:]:
        entry = raw.strip()
        # str.startswith accepts a tuple; blank entries never match.
        if entry.lower().startswith(tracked_fields):
            key, _, value = entry.partition(":")
            info_list.append(key.strip() + "|" + value.strip())
    return info_list

def dump_bios_settings_in_csv(self, log_path):
    """Extract indented ``key: value`` pairs from a dmidecode log section.

    The first line of the log is passed through as the section header with
    trailing whitespace removed. Of the remaining lines, only those that
    begin with a space or tab, contain a colon, and have a non-empty value
    after the first colon are emitted as ``key|value``. Blank lines,
    ``Handle ...`` lines and ``#`` comment lines are skipped.

    Args:
        log_path: Path handed to ``self.get_log_file_data``.

    Returns:
        A list of strings: the header followed by the ``key|value`` rows,
        or an empty list when the log contains no lines.
    """
    lines = self.get_log_file_data(log_path)
    if not lines:
        # An empty log would otherwise raise IndexError on lines[0].
        return []

    info_list = [lines[0].rstrip()]
    for raw in lines[1:]:
        line = raw.rstrip()
        if not line.strip() or line.startswith(("Handle ", "#")):
            continue
        # Only indented lines are treated as fields; unindented lines
        # (e.g. section titles) are ignored.
        if ":" in line and line[0] in (" ", "\t"):
            key, _, value = line.strip().partition(":")
            value = value.strip()
            if value:
                # Keys with nothing after the colon are dropped.
                info_list.append(key.strip() + "|" + value)
    return info_list

def dump_dmsg_gpu_drm_atom_logs_in_csv(self, log_path, max_lines=50):
    """Convert filtered dmesg lines into ``Log|<line>`` rows.

    The first line of the log is passed through as the section header with
    trailing whitespace removed. Each following non-blank line is stripped
    and emitted as ``Log|<line>``, up to ``max_lines`` rows.

    Args:
        log_path: Path handed to ``self.get_log_file_data``.
        max_lines: Maximum number of ``Log|`` rows to emit (default 50,
            the cap the function previously hard-coded).

    Returns:
        A list of strings: the header followed by at most ``max_lines``
        rows, or an empty list when the log contains no lines.
    """
    lines = self.get_log_file_data(log_path)
    if not lines:
        # An empty log would otherwise raise IndexError on lines[0].
        return []

    info_list = [lines[0].rstrip()]
    emitted = 0
    for raw in lines[1:]:
        if emitted >= max_lines:
            break
        entry = raw.strip()
        if not entry:
            continue
        info_list.append("Log|" + entry)
        emitted += 1
    return info_list

def dump_amdgpu_modinfo_in_csv(self, log_path):
    """Convert modinfo ``key: value`` lines into ``key|value`` rows.

    The first line of the log is passed through as the section header with
    trailing whitespace removed. Each following line that contains a colon
    is split on the first colon and emitted as ``key|value`` with both
    sides stripped; lines without a colon (including blank lines) are
    skipped.

    Args:
        log_path: Path handed to ``self.get_log_file_data``.

    Returns:
        A list of strings: the header followed by the ``key|value`` rows,
        or an empty list when the log contains no lines.
    """
    lines = self.get_log_file_data(log_path)
    if not lines:
        # An empty log would otherwise raise IndexError on lines[0].
        return []

    info_list = [lines[0].rstrip()]
    for raw in lines[1:]:
        line = raw.rstrip()
        if ":" not in line:
            # Covers blank lines too: they cannot contain a colon.
            continue
        # Split on the first colon only, so values keep any later colons.
        key, _, value = line.partition(":")
        info_list.append(key.strip() + "|" + value.strip())
    return info_list
Comment on lines +224 to +287

def dump_cuda_information_in_csv(self, log_path):
lines = self.get_log_file_data(log_path)
info_list = []
Expand Down Expand Up @@ -291,6 +360,14 @@ def dump_csv_output(self):
sys_config_info.extend(self.dump_pip_list_in_csv(log_path))
if tag == "numa_balancing":
sys_config_info.extend(self.dump_numa_balancing_in_csv(log_path))
if tag == "hardware_information":
sys_config_info.extend(self.dump_hardware_information_in_csv(log_path))
if tag == "bios_settings":
sys_config_info.extend(self.dump_bios_settings_in_csv(log_path))
if tag == "dmsg_gpu_drm_atom_logs":
sys_config_info.extend(self.dump_dmsg_gpu_drm_atom_logs_in_csv(log_path))
if tag == "amdgpu_modinfo":
sys_config_info.extend(self.dump_amdgpu_modinfo_in_csv(log_path))

self.sys_config_info_list = sys_config_info

Expand Down
64 changes: 62 additions & 2 deletions src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,66 @@
#

OUTPUT_FILE_NAME=${1:-"sys_config_info"}
ROCENV_MODE=${2:-"lite"}
# Third arg or MAD_GUEST_OS: must match madengine additional_context guest_os (UBUNTU, CENTOS, ...)
_GUEST_RAW=${3:-${MAD_GUEST_OS:-}}
if [ -z "${_GUEST_RAW}" ]; then
ROCENV_GUEST_OS="UBUNTU"
else
ROCENV_GUEST_OS=$(printf '%s' "${_GUEST_RAW}" | tr '[:lower:]' '[:upper:]')
fi

# Best-effort install for rocenv "full" mode, keyed off madengine guest_os (not /etc/os-release).
#
# Arguments:
#   $1  guest OS name, matched exactly against UBUNTU / CENTOS (caller upper-cases it)
#   $2  space-separated list of package names to install
# Returns:
#   0 when an install command succeeded, non-zero otherwise. Package-manager
#   output is discarded; only warnings from this function are printed.
rocenv_install_diagnostic_packages() {
    local guest="$1"
    local pkgs="$2"
    local mgr=""
    local tried=""

    case "${guest}" in
        UBUNTU)
            if ! command -v apt-get >/dev/null 2>&1; then
                echo "Warning: guest_os is UBUNTU but apt-get not found in this image; skipping package install for:${pkgs}"
                return 1
            fi
            # ${pkgs} is deliberately unquoted so it word-splits into one argument per package.
            # DEBIAN_FRONTEND=noninteractive keeps package configuration from prompting,
            # since there is no terminal to answer on in a pre-script.
            apt-get update -qq >/dev/null 2>&1 && \
                DEBIAN_FRONTEND=noninteractive apt-get install -y -qq --no-install-recommends ${pkgs} >/dev/null 2>&1
            ;;
        CENTOS)
            # Try each available package manager in order; stop at the first success.
            for mgr in microdnf dnf yum; do
                command -v "${mgr}" >/dev/null 2>&1 || continue
                tried="${tried:+${tried}, }${mgr}"
                "${mgr}" install -y -q ${pkgs} >/dev/null 2>&1 && return 0
            done
            if [ -n "${tried}" ]; then
                # A manager exists but the install failed: say so, rather than claiming none was found.
                echo "Warning: guest_os is CENTOS but package install failed using ${tried}; skipping package install for:${pkgs}"
            else
                echo "Warning: guest_os is CENTOS but no microdnf, dnf, or yum found; skipping package install for:${pkgs}"
            fi
            return 1
            ;;
        *)
            echo "Warning: rocenv full mode auto-install is not implemented for guest_os=${guest} (supported: UBUNTU, CENTOS). Missing tools:${pkgs}"
            return 1
            ;;
    esac
}

LITE_FLAG="--lite"
if [ "$ROCENV_MODE" = "full" ]; then
LITE_FLAG=""
# Install diagnostic tools on-demand if missing (best-effort)
# These are needed for hardware_information, bios_settings,
# dmsg_gpu_drm_atom_logs, and amdgpu_modinfo sections
MISSING_PKGS=""
command -v lshw >/dev/null 2>&1 || MISSING_PKGS="${MISSING_PKGS:+$MISSING_PKGS }lshw"
command -v dmidecode >/dev/null 2>&1 || MISSING_PKGS="${MISSING_PKGS:+$MISSING_PKGS }dmidecode"
command -v modinfo >/dev/null 2>&1 || MISSING_PKGS="${MISSING_PKGS:+$MISSING_PKGS }kmod"
command -v dmesg >/dev/null 2>&1 || MISSING_PKGS="${MISSING_PKGS:+$MISSING_PKGS }util-linux"
if [ -n "$MISSING_PKGS" ]; then
echo "rocenv full mode (guest_os=${ROCENV_GUEST_OS}): installing missing diagnostic tools: ${MISSING_PKGS}"
rocenv_install_diagnostic_packages "${ROCENV_GUEST_OS}" "${MISSING_PKGS}" || \
echo "Warning: could not install some diagnostic tools (network, permissions, or unsupported guest_os)"
fi
fi

# Determine the script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
Expand All @@ -13,7 +73,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [ -d "$SCRIPT_DIR/rocEnvTool" ]; then
# K8s execution: rocEnvTool is already in place
cd "$SCRIPT_DIR/rocEnvTool"
python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME
python3 rocenv_tool.py $LITE_FLAG --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME
out_dir="."$OUTPUT_FILE_NAME
out_csv=$OUTPUT_FILE_NAME".csv"
# Copy results back to workspace root
Expand All @@ -27,7 +87,7 @@ else
# Local execution: copy rocEnvTool from relative path
cp -r ../scripts/common/pre_scripts/rocEnvTool .
cd rocEnvTool
python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME
python3 rocenv_tool.py $LITE_FLAG --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME
out_dir="."$OUTPUT_FILE_NAME
out_csv=$OUTPUT_FILE_NAME".csv"
cp -r $out_dir ../../
Expand Down
32 changes: 27 additions & 5 deletions src/madengine/scripts/common/pre_scripts/trace.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,42 @@ case "$tool" in

rpd)
# OS packages only needed for RPD build; other tools (e.g. rocm_trace_lite) skip this.
# Docker madengine runs often use root with no sudo — use apt-get/yum directly when uid==0.
os=''
if command -v apt >/dev/null 2>&1; then
if command -v apt-get >/dev/null 2>&1; then
os=ubuntu
elif command -v yum >/dev/null 2>&1; then
os=centos
else
echo 'Unable to detect Host OS in pre_script (need apt or yum for RPD dependencies)' >&2
echo 'Unable to detect Host OS in pre_script (need apt-get or yum for RPD dependencies)' >&2
exit 1
fi
if [ "$os" == 'ubuntu' ]; then
sudo apt update
sudo apt install -y sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev
if [ "$(id -u)" -eq 0 ]; then
apt-get update -qq
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev \
git build-essential pkg-config xxd
elif command -v sudo >/dev/null 2>&1; then
sudo apt-get update -qq
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev \
git build-essential pkg-config xxd
else
echo 'RPD pre-script: need root or sudo for apt-get' >&2
exit 1
fi
elif [ "$os" == 'centos' ]; then
sudo yum install -y libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel
if [ "$(id -u)" -eq 0 ]; then
yum install -y gcc gcc-c++ make git \
libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel vim-common
elif command -v sudo >/dev/null 2>&1; then
sudo yum install -y gcc gcc-c++ make git \
libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel vim-common
else
echo 'RPD pre-script: need root or sudo for yum' >&2
exit 1
fi
Comment on lines 17 to +51
else
echo "Unable to detect Host OS in trace pre-script"
fi
Expand Down
3 changes: 2 additions & 1 deletion tests/e2e/test_profiling_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,8 @@ def test_rccl_trace_runs_correctly(self, global_data, clean_test_temp_files):
canFail=False,
)

regexp = re.compile(r"NCCL INFO AllReduce:")
# RCCL log prefix/format varies slightly by RCCL build; keep assertion anchored on NCCL + AllReduce.
regexp = re.compile(r"NCCL\s+INFO\s+.*AllReduce", re.IGNORECASE)
foundMatch = None
with open(
os.path.join(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
# BASE_DOCKER defaults to rocm/pytorch and can be overridden with --build-arg.
ARG BASE_DOCKER=rocm/pytorch
FROM $BASE_DOCKER

# Install system diagnostic tools required by rocenv_tool.py full mode:
# lshw -> hardware_information
# dmidecode -> bios_settings
# kmod -> amdgpu_modinfo (provides modinfo)
# util-linux -> dmsg_gpu_drm_atom_logs (provides dmesg)
# The apt package lists are removed in the same RUN step as the install.
RUN apt-get update && \
apt-get install -y --no-install-recommends lshw dmidecode kmod util-linux && \
rm -rf /var/lib/apt/lists/*
Loading