def gather_system_env_details(
    self,
    pre_scripts: List[Dict],
    model_name: str,
    rocenv_mode: str = "lite",
    guest_os: Optional[str] = None,
) -> None:
    """
    Gather system environment details by adding rocEnvTool to pre-scripts.

    Appends one ``run_rocenv_tool.sh`` entry to ``pre_scripts``. Its ``args``
    string is three space-separated tokens: the output name, the collection
    mode and the guest OS (for example ``my_model_env lite UBUNTU``).

    Args:
        pre_scripts: List of pre-script configurations (mutated in place)
        model_name: The model name (used for output file naming; "/" is
            replaced by "_")
        rocenv_mode: Collection mode - "lite" (default) or "full". Matched
            case-insensitively with surrounding whitespace ignored; any other
            value falls back to "lite" and a warning is printed on the console.
        guest_os: UBUNTU / CENTOS (madengine additional_context); ``None`` or a
            blank value falls back to ``DEFAULT_GUEST_OS``
    """
    # Normalise before validating so user-supplied values such as "Full" or
    # " full " are honoured instead of being downgraded to lite.
    mode = str(rocenv_mode).strip().lower()
    if mode not in ("lite", "full"):
        self.console.print(f"[yellow]Warning: Unknown rocenv_mode '{rocenv_mode}', defaulting to 'lite'[/yellow]")
        mode = "lite"

    # str() tolerates non-string config values; a blank result falls back to
    # the default so the script never receives an empty third argument.
    go = str(guest_os or DEFAULT_GUEST_OS).strip().upper() or DEFAULT_GUEST_OS

    output_name = model_name.replace("/", "_") + "_env"
    pre_env_details = {
        "path": "scripts/common/pre_scripts/run_rocenv_tool.sh",
        "args": f"{output_name} {mode} {go}",
    }
    pre_scripts.append(pre_env_details)
    self.console.print(f"[dim]Added rocEnvTool (mode={mode}) to pre-scripts with args: {pre_env_details['args']}[/dim]")
pre_scripts: List[Dict], post_scripts: List[Dict]) -> None: """ diff --git a/src/madengine/deployment/k8s_template_context.py b/src/madengine/deployment/k8s_template_context.py index 78c01072..bdb43d0e 100644 --- a/src/madengine/deployment/k8s_template_context.py +++ b/src/madengine/deployment/k8s_template_context.py @@ -28,6 +28,7 @@ infer_primus_backend_from_model_name, merged_primus_config, ) +from madengine.core.additional_context_defaults import DEFAULT_GUEST_OS from madengine.core.dataprovider import Data from madengine.core.errors import ConfigurationError from madengine.utils.gpu_config import resolve_runtime_gpus @@ -418,7 +419,16 @@ def _prepare_template_context( # This is controlled by generate_sys_env_details flag (default: True) generate_sys_env_details = self.config.additional_context.get("generate_sys_env_details", True) if generate_sys_env_details: - self.gather_system_env_details(pre_scripts, model_info["name"]) + rocenv_mode = self.config.additional_context.get("rocenv_mode", "lite") + guest_os = str( + self.config.additional_context.get("guest_os") or DEFAULT_GUEST_OS + ).strip().upper() or DEFAULT_GUEST_OS + self.gather_system_env_details( + pre_scripts, + model_info["name"], + rocenv_mode=rocenv_mode, + guest_os=guest_os, + ) # Add tool pre/post scripts to the execution lists (like local execution) self._add_tool_scripts(pre_scripts, post_scripts) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 2ffc8a31..991e14bb 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -34,6 +34,7 @@ from madengine.utils.config_parser import ConfigParser from madengine.utils.path_utils import scripts_base_dir_from from madengine.utils.run_details import get_build_number, get_pipeline +from madengine.core.additional_context_defaults import DEFAULT_GUEST_OS from madengine.utils.therock_markers import is_therock_tree from madengine.deployment.base 
import PERFORMANCE_LOG_PATTERN from madengine.execution.container_runner_helpers import ( @@ -793,6 +794,12 @@ def gather_system_env_details( ) -> None: """Gather system environment details. + Appends ``run_rocenv_tool.sh`` to pre_scripts with args: + `` `` (e.g. ``my_model_env lite UBUNTU``). + ``guest_os`` comes from ``docker_env_vars.MAD_GUEST_OS`` (if set) else + ``context.ctx['guest_os']``, defaulting to ``UBUNTU`` — aligned with + ``MAD_GUEST_OS`` injected for the container before this runs. + Args: pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. model_name: The model name. @@ -809,7 +816,16 @@ def gather_system_env_details( # initialize pre_env_details pre_env_details = {} pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh" - pre_env_details["args"] = model_name.replace("/", "_") + "_env" + output_name = model_name.replace("/", "_") + "_env" + rocenv_mode = self.context.ctx.get("rocenv_mode", "lite") + if rocenv_mode not in ("lite", "full"): + print(f"Warning: Unknown rocenv_mode '{rocenv_mode}', defaulting to 'lite'") + rocenv_mode = "lite" + dv = self.context.ctx.get("docker_env_vars") or {} + guest_os = str( + dv.get("MAD_GUEST_OS") or self.context.ctx.get("guest_os", DEFAULT_GUEST_OS) + ).strip().upper() + pre_env_details["args"] = f"{output_name} {rocenv_mode} {guest_os}" pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) print(f"pre encap post scripts: {pre_encapsulate_post_scripts}") @@ -947,7 +963,7 @@ def run_container( # Also check shell environment for SLURM-passed variables if "docker_env_vars" not in self.context.ctx: self.context.ctx["docker_env_vars"] = {} - + # For SLURM jobs, check shell environment and populate additional_context with GPU info # This ensures GPU resolution works correctly if os.environ.get("MAD_DEPLOYMENT_TYPE") == "slurm": @@ -1009,6 +1025,12 @@ def run_container( if merged_count > 0: print(f"â„šī¸ Merged {merged_count} environment variables from 
def dump_hardware_information_in_csv(self, log_path):
    """Parse lshw output, extracting key hardware fields.

    The first line of the log is copied through unchanged (right-stripped).
    Every following line that starts (case-insensitively, after trimming)
    with one of the tracked field names becomes a ``key|value`` entry.

    Args:
        log_path: Path handed to ``self.get_log_file_data``.

    Returns:
        List of strings; empty when the log contains no lines.
    """
    lines = self.get_log_file_data(log_path)
    if not lines:
        # An empty log would otherwise raise IndexError on lines[0].
        return []
    keywords = ("product:", "vendor:", "serial:", "width:", "size:",
                "description:", "capabilities:", "clock:")
    info_list = [lines[0].rstrip()]
    for raw in lines[1:]:
        entry = raw.strip()
        # str.startswith accepts a tuple; blank entries never match.
        if entry.lower().startswith(keywords):
            key, _, value = entry.partition(":")
            info_list.append(key.strip() + "|" + value.strip())
    return info_list


def dump_bios_settings_in_csv(self, log_path):
    """Parse dmidecode output, extracting key:value pairs.

    Only lines that begin with a space or tab and contain ``:`` with a
    non-empty value are kept; "Handle " lines, ``#`` comment lines, blank
    lines and value-less keys are skipped.

    Args:
        log_path: Path handed to ``self.get_log_file_data``.

    Returns:
        List of strings: the header line followed by ``key|value`` entries;
        empty when the log contains no lines.
    """
    lines = self.get_log_file_data(log_path)
    if not lines:
        return []
    info_list = [lines[0].rstrip()]
    for raw in lines[1:]:
        line = raw.rstrip()
        if not line.strip() or line.startswith(("Handle ", "#")):
            continue
        if ":" in line and line[0] in (" ", "\t"):
            key, _, value = line.strip().partition(":")
            value = value.strip()
            if value:
                info_list.append(key.strip() + "|" + value)
    return info_list


def dump_dmsg_gpu_drm_atom_logs_in_csv(self, log_path, max_lines=50):
    """Parse dmesg filtered log lines.

    Args:
        log_path: Path handed to ``self.get_log_file_data``.
        max_lines: Maximum number of non-blank log lines to keep after the
            header (default 50); ``None`` keeps all of them.

    Returns:
        List of strings: the header line followed by ``Log|<line>`` entries;
        empty when the log contains no lines.
    """
    lines = self.get_log_file_data(log_path)
    if not lines:
        return []
    info_list = [lines[0].rstrip()]
    kept = [entry for entry in (raw.strip() for raw in lines[1:]) if entry]
    info_list.extend("Log|" + entry for entry in kept[:max_lines])
    return info_list


def dump_amdgpu_modinfo_in_csv(self, log_path):
    """Parse modinfo output (key:value per line, like lscpu).

    Each line containing ``:`` is split at the first colon into a
    ``key|value`` entry; lines without a colon (including blank ones) are
    ignored.

    Args:
        log_path: Path handed to ``self.get_log_file_data``.

    Returns:
        List of strings: the header line followed by ``key|value`` entries;
        empty when the log contains no lines.
    """
    lines = self.get_log_file_data(log_path)
    if not lines:
        return []
    info_list = [lines[0].rstrip()]
    for raw in lines[1:]:
        line = raw.rstrip()
        if ":" in line:
            key, _, value = line.partition(":")
            info_list.append(key.strip() + "|" + value.strip())
    return info_list
# Best-effort install for rocenv "full" mode, keyed off madengine guest_os (not /etc/os-release).
#   $1  guest OS name as passed by the caller (UBUNTU or CENTOS)
#   $2  space-separated list of package names to install
# Returns 0 when the install succeeded (or nothing was requested), 1 otherwise.
# Package-manager output is discarded; only warnings are printed.
rocenv_install_diagnostic_packages() {
    local guest="$1"
    local pkgs="$2"
    local mgr
    local tried=""

    # Nothing requested: do not invoke a package manager with no arguments.
    [ -n "${pkgs}" ] || return 0

    case "${guest}" in
        UBUNTU)
            if ! command -v apt-get >/dev/null 2>&1; then
                echo "Warning: guest_os is UBUNTU but apt-get not found in this image; skipping package install for:${pkgs}"
                return 1
            fi
            # ${pkgs} is left unquoted on purpose so it splits into one argument per package.
            # DEBIAN_FRONTEND=noninteractive keeps debconf from waiting on a prompt nobody can answer.
            apt-get update -qq >/dev/null 2>&1 && \
                DEBIAN_FRONTEND=noninteractive apt-get install -y -qq --no-install-recommends ${pkgs} >/dev/null 2>&1
            ;;
        CENTOS)
            # Try each available manager in order; stop at the first one that succeeds.
            for mgr in microdnf dnf yum; do
                command -v "${mgr}" >/dev/null 2>&1 || continue
                tried="${tried:+${tried}, }${mgr}"
                "${mgr}" install -y -q ${pkgs} >/dev/null 2>&1 && return 0
            done
            if [ -z "${tried}" ]; then
                echo "Warning: guest_os is CENTOS but no microdnf, dnf, or yum found; skipping package install for:${pkgs}"
            else
                # A manager exists but the install itself failed; say so instead of
                # claiming that no package manager was found.
                echo "Warning: guest_os is CENTOS but package install failed using ${tried} for: ${pkgs}"
            fi
            return 1
            ;;
        *)
            echo "Warning: rocenv full mode auto-install is not implemented for guest_os=${guest} (supported: UBUNTU, CENTOS). Missing tools:${pkgs}"
            return 1
            ;;
    esac
}
cd rocEnvTool - python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME + python3 rocenv_tool.py $LITE_FLAG --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME out_dir="."$OUTPUT_FILE_NAME out_csv=$OUTPUT_FILE_NAME".csv" cp -r $out_dir ../../ diff --git a/src/madengine/scripts/common/pre_scripts/trace.sh b/src/madengine/scripts/common/pre_scripts/trace.sh index 5c591c83..f6b8e624 100644 --- a/src/madengine/scripts/common/pre_scripts/trace.sh +++ b/src/madengine/scripts/common/pre_scripts/trace.sh @@ -13,20 +13,42 @@ case "$tool" in rpd) # OS packages only needed for RPD build; other tools (e.g. rocm_trace_lite) skip this. + # Docker madengine runs often use root with no sudo — use apt-get/yum directly when uid==0. os='' - if command -v apt >/dev/null 2>&1; then + if command -v apt-get >/dev/null 2>&1; then os=ubuntu elif command -v yum >/dev/null 2>&1; then os=centos else - echo 'Unable to detect Host OS in pre_script (need apt or yum for RPD dependencies)' >&2 + echo 'Unable to detect Host OS in pre_script (need apt-get or yum for RPD dependencies)' >&2 exit 1 fi if [ "$os" == 'ubuntu' ]; then - sudo apt update - sudo apt install -y sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev + if [ "$(id -u)" -eq 0 ]; then + apt-get update -qq + DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev \ + git build-essential pkg-config xxd + elif command -v sudo >/dev/null 2>&1; then + sudo apt-get update -qq + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + sqlite3 libsqlite3-dev libfmt-dev python3-pip nlohmann-json3-dev \ + git build-essential pkg-config xxd + else + echo 'RPD pre-script: need root or sudo for apt-get' >&2 + exit 1 + fi elif [ "$os" == 'centos' ]; then - sudo yum install -y libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel + if [ "$(id -u)" -eq 0 ]; then + yum install -y gcc gcc-c++ make git \ + libsqlite3x-devel.x86_64 
fmt-devel python3-pip json-devel vim-common + elif command -v sudo >/dev/null 2>&1; then + sudo yum install -y gcc gcc-c++ make git \ + libsqlite3x-devel.x86_64 fmt-devel python3-pip json-devel vim-common + else + echo 'RPD pre-script: need root or sudo for yum' >&2 + exit 1 + fi else echo "Unable to detect Host OS in trace pre-script" fi diff --git a/tests/e2e/test_profiling_workflows.py b/tests/e2e/test_profiling_workflows.py index 74925ae2..acc0b960 100644 --- a/tests/e2e/test_profiling_workflows.py +++ b/tests/e2e/test_profiling_workflows.py @@ -316,7 +316,8 @@ def test_rccl_trace_runs_correctly(self, global_data, clean_test_temp_files): canFail=False, ) - regexp = re.compile(r"NCCL INFO AllReduce:") + # RCCL log prefix/format varies slightly by RCCL build; keep assertion anchored on NCCL + AllReduce. + regexp = re.compile(r"NCCL\s+INFO\s+.*AllReduce", re.IGNORECASE) foundMatch = None with open( os.path.join( diff --git a/tests/fixtures/dummy/docker/dummy_rocenv_full.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_rocenv_full.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..019606dd --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_rocenv_full.ubuntu.amd.Dockerfile @@ -0,0 +1,12 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER + +# Install system diagnostic tools required by rocenv_tool.py full mode: +# lshw -> hardware_information +# dmidecode -> bios_settings +# kmod -> amdgpu_modinfo (provides modinfo) +# util-linux -> dmsg_gpu_drm_atom_logs (provides dmesg) +RUN apt-get update && \ + apt-get install -y --no-install-recommends lshw dmidecode kmod util-linux && \ + rm -rf /var/lib/apt/lists/* diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index 140779ab..8c8a29c1 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -439,6 +439,19 @@ ], "args": "" }, + { + "name": "dummy_rocenv_full", + "dockerfile": 
#!/usr/bin/env bash
#
# Copyright (c) Advanced Micro Devices, Inc.
# All rights reserved.
#
# Single-process NCCL sanity check for RCCL_TRACE e2e tests. On hosts where the
# container exposes fewer GPUs than the physical topology (e.g. MI300X multi-OAM),
# RCCL can spend a long time probing inaccessible ROCr/KFD nodes without these guards.

# Without pipefail the status of "python3 ... | tee" is tee's, so a crashed
# python run would still reach the "performance" line below and be reported
# as a pass. With -e and pipefail the script stops with a non-zero status.
set -eo pipefail

# Each guard keeps a caller-provided value and only fills in a default.
export HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES:-0}"
export NCCL_IB_DISABLE="${NCCL_IB_DISABLE:-1}"
export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-lo}"
export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
export MASTER_PORT="${MASTER_PORT:-29501}"

python3 -c "
import os
import torch
import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29501')
dist.init_process_group('nccl', rank=0, world_size=1)
tensor = torch.arange(1, dtype=torch.int64).cuda()
dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
print(tensor[0])
" | tee log.txt

echo "performance: 1 pass"
# --- tests/unit/test_container_runner.py ---
class TestGatherSystemEnvDetailsRocenvMode:
    """Check the args string ContainerRunner builds for run_rocenv_tool.sh."""

    def _make_runner(self, ctx_overrides=None):
        """Return a ContainerRunner whose mocked context exposes *ctx_overrides* as ``ctx``."""
        mock_context = MagicMock()
        mock_context.ctx = ctx_overrides if ctx_overrides else {}
        return ContainerRunner(context=mock_context, console=MagicMock())

    def _rocenv_args(self, model_name, ctx_overrides=None):
        """Run gather_system_env_details and return the args of the appended pre-script."""
        scripts = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []}
        self._make_runner(ctx_overrides).gather_system_env_details(scripts, model_name)
        return scripts["pre_scripts"][0]["args"]

    def test_default_mode_is_lite(self):
        """With no rocenv_mode in the context, mode is 'lite' and guest_os is UBUNTU."""
        assert self._rocenv_args("my_model") == "my_model_env lite UBUNTU"

    def test_explicit_lite_mode(self):
        """An explicit 'lite' mode is passed through together with the guest_os."""
        assert self._rocenv_args("my_model", {"rocenv_mode": "lite"}) == "my_model_env lite UBUNTU"

    def test_full_mode(self):
        """'full' mode is passed through; '/' in the model name becomes '_'."""
        assert self._rocenv_args("org/my_model", {"rocenv_mode": "full"}) == "org_my_model_env full UBUNTU"

    def test_guest_os_centos(self):
        """The context guest_os becomes the third argument, upper-cased."""
        overrides = {"rocenv_mode": "lite", "guest_os": "centos"}
        assert self._rocenv_args("my_model", overrides) == "my_model_env lite CENTOS"

    def test_mad_guest_os_overrides_guest_os(self):
        """docker_env_vars MAD_GUEST_OS takes precedence over the top-level guest_os."""
        overrides = {
            "guest_os": "UBUNTU",
            "docker_env_vars": {"MAD_GUEST_OS": "CENTOS"},
        }
        assert self._rocenv_args("my_model", overrides) == "my_model_env lite CENTOS"

    def test_invalid_mode_falls_back_to_lite(self):
        """An unrecognised rocenv_mode is replaced by 'lite'."""
        assert self._rocenv_args("my_model", {"rocenv_mode": "invalid"}) == "my_model_env lite UBUNTU"


# --- tests/unit/test_k8s.py ---
class TestGatherSystemEnvDetailsK8sRocenvMode:
    """Check the args string the K8s mixin builds for run_rocenv_tool.sh."""

    def _make_mixin(self):
        """Return a KubernetesScriptsMixin with a mocked console attached."""
        from unittest.mock import MagicMock
        from madengine.deployment.k8s_scripts import KubernetesScriptsMixin

        instance = KubernetesScriptsMixin()
        instance.console = MagicMock()
        return instance

    def _rocenv_args(self, model_name, **options):
        """Run gather_system_env_details and return the args of the appended pre-script."""
        collected = []
        self._make_mixin().gather_system_env_details(collected, model_name, **options)
        return collected[0]["args"]

    def test_default_mode_is_lite(self):
        assert self._rocenv_args("my_model") == "my_model_env lite UBUNTU"

    def test_full_mode(self):
        assert self._rocenv_args("org/my_model", rocenv_mode="full") == "org_my_model_env full UBUNTU"

    def test_explicit_lite_mode(self):
        assert self._rocenv_args("my_model", rocenv_mode="lite") == "my_model_env lite UBUNTU"

    def test_guest_os_centos(self):
        observed = self._rocenv_args("my_model", rocenv_mode="lite", guest_os="centos")
        assert observed == "my_model_env lite CENTOS"

    def test_invalid_mode_falls_back_to_lite(self):
        assert self._rocenv_args("my_model", rocenv_mode="bogus") == "my_model_env lite UBUNTU"