diff --git a/.gitignore b/.gitignore index 37d74f4f..6c6ea476 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ Thumbs.db # Distribution / packaging .Python build/ +!src/madengine/configs/build/ develop-eggs/ dist/ downloads/ @@ -144,4 +145,4 @@ rocm_trace_lite_output/ slurm_results/ MagicMock/ .madengine_session_start -run_directory/ \ No newline at end of file +run_directory/ diff --git a/README.md b/README.md index ea3fbbb5..f66dda03 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ madengine is a modern CLI tool for running Large Language Models (LLMs) and Deep - [Performance Profiling](#-performance-profiling) - [Reporting and Database](#-reporting-and-database) - [Installation](#-installation) +- [YAML Configuration (`--config`)](#-yaml-configuration-config) - [Tips & Best Practices](#-tips--best-practices) - [Log error pattern scan](#log-error-pattern-scan) - [Exit codes and CI](#exit-codes-and-ci) @@ -39,6 +40,7 @@ madengine is a modern CLI tool for running Large Language Models (LLMs) and Deep ## ✨ Key Features - **🚀 Modern CLI** - Rich terminal output with Typer and Rich +- **📝 YAML Config** - Composable [Hydra-based YAML configs](#-yaml-configuration-config) with config groups, hardware profiles, and CLI overrides — alternative to `--additional-context` JSON - **🎯 Simple Deployment** - Run locally or deploy to Kubernetes/SLURM via configuration - **🔧 Distributed Launchers** - Full support for torchrun, DeepSpeed, Megatron-LM, TorchTitan, Primus, vLLM, SGLang - **🐳 Container-Native** - Docker-based execution with GPU support (ROCm, CUDA) @@ -64,12 +66,16 @@ madengine discover --tags dummy # Run locally (full workflow: discover/build/run as configured by the model) madengine run --tags dummy -# Or with explicit configuration +# Or with explicit JSON configuration madengine run --tags dummy \ --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Or with YAML config (Hydra-based, composable) +madengine run --tags dummy 
--config scheduler=slurm --config launcher=torchrun +madengine run --config my_job.yaml ``` -> **Note**: For build operations, `gpu_vendor` defaults to `AMD` and `guest_os` defaults to `UBUNTU` if not specified. For production deployments or non-AMD/Ubuntu environments, explicitly specify these values. +> **Note**: `--config` is mutually exclusive with `--additional-context` / `--additional-context-file`. For build operations, `gpu_vendor` defaults to `AMD` and `guest_os` defaults to `UBUNTU` if not specified. If auto-detection does not find your **host** ROCm root, set top-level `MAD_ROCM_PATH` in `--additional-context`. For a different ROCm root **inside the container**, set `docker_env_vars.MAD_ROCM_PATH` in additional context. If you omit it, madengine derives in-container `ROCM_PATH` when running Docker (from the image's baked-in env, then an in-container probe, then `/opt/rocm` — it does **not** copy the host path). You can also set `ROCM_PATH` / `MAD_AUTO_ROCM_PATH=0` for **host** behavior as documented in [docs/configuration.md](docs/configuration.md): @@ -127,7 +133,7 @@ For detailed command options, see the **[CLI Command Reference](docs/cli-referen | [Usage Guide](docs/usage.md) | Commands, workflows, and examples ([`--skip-model-run`](docs/usage.md#skip-model-run-after-build)) | | **[CLI Reference](docs/cli-reference.md)** | **Detailed command options and examples** | | [Deployment](docs/deployment.md) | Kubernetes and SLURM deployment | -| [Configuration](docs/configuration.md) | Advanced options; [run log error pattern scan](docs/configuration.md#run-phase-log-error-pattern-scan) | +| [Configuration](docs/configuration.md) | Advanced options; [YAML config (`--config`)](docs/configuration.md#yaml-configuration-config); [run log error pattern scan](docs/configuration.md#run-phase-log-error-pattern-scan) | | [Batch Build](docs/batch-build.md) | Selective builds for CI/CD | | [Launchers](docs/launchers.md) | Distributed training frameworks | | 
[Profiling](docs/profiling.md) | Performance analysis tools | @@ -565,6 +571,115 @@ cd madengine && pip install -e ".[dev]" See [Installation Guide](docs/installation.md) for detailed instructions. +## 📝 YAML Configuration (`--config`) + +The `--config` flag provides a composable, Hydra-based YAML alternative to `--additional-context` JSON strings. It is available on both `run` and `build` commands. + +> **Note**: `--config` is **mutually exclusive** with `--additional-context` and `--additional-context-file`. Using them together produces an error. + +### Basic Usage + +```bash +# Use a config group override +madengine run --tags dummy --config scheduler=slurm + +# Combine multiple overrides +madengine run --tags dummy \ + --config scheduler=slurm \ + --config launcher=torchrun \ + --config distributed.nnodes=4 + +# Use a user YAML file +madengine run --config my_job.yaml + +# User YAML file with overrides +madengine run --config my_job.yaml --config distributed.nnodes=8 + +# Append optional config groups with '+' prefix +madengine run --tags dummy \ + --config +profile=mi300x_8gpu \ + --config +env=nccl_debug \ + --config +tools=rocprofv3_lightweight +``` + +### Config Groups + +madengine ships with pre-built config groups that compose together: + +| Group | Default | Options | Description | +|-------|---------|---------|-------------| +| `platform` | `docker` | docker, bare_metal, singularity, podman | Execution platform | +| `scheduler` | `local` | local, slurm, k8s | Job scheduler | +| `hardware` | `amd` | amd, nvidia, cpu | GPU vendor and runtime settings | +| `launcher` | `none` | none, torchrun, deepspeed, megatron, torchtitan, vllm, sglang, sglang_disagg, primus, native | Distributed launcher | +| `+profile` | *(none)* | mi300x_8gpu, mi300x_single, mi250x_4gpu, h100_8gpu, a100_8gpu | Hardware profiles (append-only) | +| `+env` | *(none)* | nccl_debug, nccl_tuned, infiniband, miopen_defaults | Environment presets (append-only) | +| `+tools` | *(none)* | 
rocprofv3_lightweight, rocprofv3_comprehensive, power_profiler, vram_profiler, rocm_trace_lite | Profiling tools (append-only) | +| `+data` | *(none)* | local, s3, minio, nas | Data source config (append-only) | +| `+build` | *(none)* | default, ci, multi_arch | Build presets (append-only) | + +Groups with `+` prefix are append-only — they are not loaded by default and must be explicitly added. + +### User YAML Files + +Create a YAML file for your job and pass it via `--config`: + +```yaml +# my_job.yaml +model: + tags: [dummy] + timeout: 3600 + +debug: true + +env_vars: + MY_VAR: test_value + NCCL_DEBUG: INFO + +distributed: + enabled: true + launcher: torchrun + nnodes: 2 + nproc_per_node: 4 + +slurm: + partition: gpu + time: "02:00:00" +``` + +```bash +madengine run --config my_job.yaml +``` + +User YAML values are merged on top of the base config and config group selections; inline `key=value` overrides passed alongside the file (e.g., `--config distributed.nnodes=8`) still take precedence over it. + +### Examples + +```bash +# SLURM multi-node with torchrun +madengine run --tags model \ + --config scheduler=slurm \ + --config launcher=torchrun \ + --config distributed.nnodes=4 + +# MI300x 8-GPU profile with NCCL debug +madengine run --tags model \ + --config +profile=mi300x_8gpu \ + --config +env=nccl_debug + +# NVIDIA hardware with profiling +madengine run --tags model \ + --config hardware=nvidia \ + --config +tools=rocprofv3_lightweight + +# Build with CI preset +madengine build --tags model \ + --config +build=ci \ + --registry docker.io/myorg +``` + +See [Configuration Guide](docs/configuration.md#yaml-configuration-config) for full details, and [`examples/configs/`](examples/configs/) for annotated templates and ready-to-run demo files. 
+ ## 💡 Tips & Best Practices ### General Usage diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 9340ae1c..e1ae31f6 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -100,6 +100,7 @@ madengine build [OPTIONS] | `--batch-manifest` | | TEXT | `None` | Input batch.json file for batch build mode | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | | `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | +| `--config` | | TEXT | `None` | YAML config file and/or Hydra overrides (repeatable). Mutually exclusive with `--additional-context` / `--additional-context-file`. See [Configuration — YAML config](configuration.md#yaml-configuration-config). | | `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache | | `--manifest-output` | `-m` | TEXT | `build_manifest.json` | Output file for build manifest | | `--summary-output` | `-s` | TEXT | `None` | Output file for build summary JSON | @@ -142,6 +143,12 @@ madengine build --tags model \ # Real-time output with verbose logging madengine build --tags model --live-output --verbose + +# Build with YAML config (mutually exclusive with --additional-context) +madengine build --tags model --config +build=ci --registry docker.io/myorg + +# Build with user YAML file +madengine build --config my_build.yaml --registry docker.io/myorg ``` **Default Values:** @@ -215,6 +222,7 @@ madengine run [OPTIONS] | `--timeout` | | INT | `-1` | Timeout in seconds (-1=default 7200s, 0=no timeout) | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | | `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | +| `--config` | | TEXT | `None` | YAML config file and/or Hydra overrides (repeatable). Mutually exclusive with `--additional-context` / `--additional-context-file`. See [Configuration — YAML config](configuration.md#yaml-configuration-config). 
| | `--keep-alive` | | FLAG | `False` | Keep Docker containers alive after run | | `--keep-model-dir` | | FLAG | `False` | Keep model directory after run | | `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache (full workflow) | @@ -326,9 +334,23 @@ madengine run --tags model --output my_perf_results.csv # Clean up intermediate perf files after run madengine run --tags model --cleanup-perf -# Using configuration file +# Using JSON configuration file madengine run --tags model \ --additional-context-file k8s-config.json + +# Using YAML config (mutually exclusive with --additional-context) +madengine run --tags model \ + --config scheduler=slurm \ + --config launcher=torchrun \ + --config distributed.nnodes=4 + +# YAML config with hardware profile +madengine run --tags model \ + --config +profile=mi300x_8gpu \ + --config +env=nccl_debug + +# User YAML file with overrides +madengine run --config my_job.yaml --config distributed.nnodes=8 ``` **Execution Modes:** @@ -601,6 +623,23 @@ For complex configurations, use JSON files with `--additional-context-file`: To run on specific nodes, add `"nodelist": "node01,node02"` to the `slurm` section. When set, the job runs only on those nodes and node health preflight is skipped. See [examples/slurm-configs/basic/03-multi-node-basic-nodelist.json](../examples/slurm-configs/basic/03-multi-node-basic-nodelist.json). +### YAML Configuration (`--config`) + +As an alternative to JSON, use `--config` with composable Hydra-based YAML: + +```bash +# Config group overrides +madengine run --tags model --config scheduler=slurm --config launcher=torchrun + +# User YAML file +madengine run --config my_job.yaml + +# Append-only groups (profiles, tools, env presets) +madengine run --tags model --config +profile=mi300x_8gpu --config +env=nccl_debug +``` + +`--config` is **mutually exclusive** with `--additional-context` / `--additional-context-file`. 
See [Configuration Guide — YAML Configuration](configuration.md#yaml-configuration-config) for config groups, user YAML format, and full examples. + ### Run phase: log error pattern scan (optional) These keys apply to **local Docker runs** when madengine post-processes the run log. Use them when substring matches cause false `FAILURE` status (for example benign `RuntimeError:` lines). Full details: [Configuration — Run phase: log error pattern scan](configuration.md#run-phase-log-error-pattern-scan). diff --git a/docs/configuration.md b/docs/configuration.md index 034ac6d8..4409ca97 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -11,7 +11,7 @@ madengine run --tags model \ --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' ``` -### 2. Configuration File +### 2. JSON Configuration File ```bash madengine run --tags model --additional-context-file config.json @@ -26,6 +26,163 @@ madengine run --tags model --additional-context-file config.json } ``` +### 3. YAML Configuration (`--config`) + +```bash +madengine run --tags model --config scheduler=slurm --config launcher=torchrun +madengine run --config my_job.yaml +``` + +> **Mutual exclusion**: `--config` cannot be combined with `--additional-context` or `--additional-context-file`. Using both produces an error. + +See [YAML Configuration](#yaml-configuration-config) below for full details. + +## YAML Configuration (`--config`) + +The `--config` flag provides composable, Hydra-based YAML configuration as an alternative to JSON strings. It is available on both `run` and `build` commands. + +### How It Works + +1. madengine loads a base `config.yaml` with sensible defaults (AMD hardware, Docker platform, local scheduler) +2. **Config group overrides** (e.g., `scheduler=slurm`) swap in pre-built YAML fragments +3. **Inline overrides** (e.g., `distributed.nnodes=4`) set individual values +4. 
**User YAML files** (e.g., `my_job.yaml`) merge on top of the base config and config group selections; inline overrides still take precedence over the file (see [Priority Order](#priority-order)) + +All four can be combined in a single command: + +```bash +madengine run --config my_job.yaml \ + --config scheduler=slurm \ + --config launcher=torchrun \ + --config distributed.nnodes=4 +``` + +### Config Groups + +madengine ships with pre-built config groups under `src/madengine/configs/`: + +#### Default Groups (swapped via `group=option`) + +| Group | Default | Options | Description | +|-------|---------|---------|-------------| +| `platform` | `docker` | `docker`, `bare_metal`, `singularity`, `podman` | Execution platform (validation currently accepts only `docker`; see [Validation](#validation)) | +| `scheduler` | `local` | `local`, `slurm`, `k8s` | Job scheduler — `slurm` and `k8s` add their respective config sections | +| `hardware` | `amd` | `amd`, `nvidia`, `cpu` | Sets `gpu_vendor`, `guest_os`, runtime device config | +| `launcher` | `none` | `none`, `torchrun`, `deepspeed`, `megatron`, `torchtitan`, `vllm`, `sglang`, `sglang_disagg`, `primus`, `native` | Distributed launcher — sets `distributed.enabled`, `distributed.launcher`, and launcher-specific defaults | + +#### Append-Only Groups (added via `+group=option`) + +These are not loaded by default. 
Use the `+` prefix to add them: + +| Group | Options | Description | +|-------|---------|-------------| +| `+profile` | `mi300x_8gpu`, `mi300x_single`, `mi250x_4gpu`, `h100_8gpu`, `a100_8gpu` | Hardware profiles — sets GPU type, environment variables, distributed settings | +| `+env` | `nccl_debug`, `nccl_tuned`, `infiniband`, `miopen_defaults` | Environment variable presets | +| `+tools` | `rocprofv3_lightweight`, `rocprofv3_comprehensive`, `power_profiler`, `vram_profiler`, `rocm_trace_lite` | Profiling tool presets | +| `+data` | `local`, `s3`, `minio`, `nas` | Data source configuration | +| `+build` | `default`, `ci`, `multi_arch` | Build presets for CI or multi-arch builds | + +### User YAML Files + +Create a job-specific YAML file and pass it via `--config`: + +```yaml +# my_slurm_job.yaml +model: + tags: [my_model] + timeout: 3600 + +debug: true + +env_vars: + MY_VAR: test_value + NCCL_DEBUG: INFO + +distributed: + enabled: true + launcher: torchrun + nnodes: 2 + nproc_per_node: 4 + +slurm: + partition: gpu + time: "02:00:00" +``` + +```bash +madengine run --config my_slurm_job.yaml +``` + +User YAML values merge on top of the base config and any config group selections. You can also combine a user file with overrides: + +```bash +madengine run --config my_slurm_job.yaml --config distributed.nnodes=8 +``` + +### Priority Order + +1. **Inline overrides** (`key=value`) — highest +2. **User YAML file** — merged on top of composed config +3. **Config group selections** (`scheduler=slurm`) +4. 
**Base config defaults** — lowest + +### Examples + +```bash +# Local run with the default groups selected explicitly (AMD, Docker, no distribution) +madengine run --tags dummy --config hardware=amd + +# SLURM multi-node training +madengine run --tags model \ + --config scheduler=slurm \ + --config launcher=torchrun \ + --config distributed.nnodes=4 + +# MI300x 8-GPU profile with NCCL debug +madengine run --tags model \ + --config +profile=mi300x_8gpu \ + --config +env=nccl_debug + +# NVIDIA hardware +madengine run --tags model --config hardware=nvidia + +# Kubernetes with vLLM inference +madengine run --tags model \ + --config scheduler=k8s \ + --config launcher=vllm \ + --config distributed.nnodes=2 + +# Build with CI preset +madengine build --tags model \ + --config +build=ci \ + --registry docker.io/myorg + +# User YAML with profiling +madengine run --config my_job.yaml \ + --config +tools=rocprofv3_lightweight +``` + +### Metadata from Config + +When using `--config`, certain YAML keys are extracted as metadata rather than passed to the internal context: + +- `model.tags` — used as `--tags` if not specified on the CLI +- `model.timeout` — used as `--timeout` if not specified +- `model.container_image` — promoted to `MAD_CONTAINER_IMAGE` in context +- `build.registry` — used as `--registry` if not specified +- `build.target_archs` — used as `--target-archs` if not specified +- `platform`, `output`, `summary_output`, `data_config`, `live_output` — extracted to metadata + +### Validation + +madengine validates the composed config and reports errors for: + +- Conflicting scheduler selections (e.g., both `slurm` and `k8s` sections present) +- `distributed.enabled: true` without a `distributed.launcher` +- Invalid `distributed.nnodes` (must be a positive integer) +- Unsupported `platform.type` (currently only `docker` is supported) +- Unknown top-level config keys (catches typos) + ## Default Configuration Values madengine provides sensible defaults for common AMD/Ubuntu workflows: @@ -389,6 
+546,8 @@ Automatically applies (see presets under `src/madengine/deployment/presets/k8s/` } ``` +See [`examples/configs/templates/k8s.yaml`](../examples/configs/templates/k8s.yaml) for the complete annotated YAML template, or [`examples/configs/demo/k8s/`](../examples/configs/demo/k8s/) for ready-to-run examples. + ## SLURM Deployment ### Basic Configuration @@ -414,10 +573,7 @@ Automatically applies (see presets under `src/madengine/deployment/presets/k8s/` "gpus_per_node": 8, "nodes": 2, "nodelist": "node01,node02", - "time": "24:00:00", - "mem": "64G", - "mail_user": "user@example.com", - "mail_type": "ALL" + "time": "24:00:00" } } ``` @@ -428,13 +584,16 @@ Automatically applies (see presets under `src/madengine/deployment/presets/k8s/` - `partition` - SLURM partition name (required) - `account` - Billing account - `qos` - Quality of Service -- `gpus_per_node` - GPUs per node (default: 1) +- `gpus_per_node` - GPUs per node (default: 8) - `nodes` - Number of nodes (default: 1) - `nodelist` - Comma-separated node names to run on (e.g. 
`"node01,node02"`); when set, job is restricted to these nodes and automatic node health preflight is skipped -- `time` - Wall time limit HH:MM:SS (required) -- `mem` - Memory per node (e.g., "64G") -- `mail_user` - Email for notifications -- `mail_type` - Notification types (BEGIN, END, FAIL, ALL) +- `exclude` - Comma-separated node names to exclude +- `constraint` - Node feature constraint (e.g., `"infiniband"`) +- `time` - Wall time limit HH:MM:SS (default: `"24:00:00"`) +- `exclusive` - Request exclusive node access (default: `true`) +- `modules` - List of environment modules to load +- `network_interface` - Network interface for NCCL/GLOO (e.g., `"ib0"`) +- `shared_workspace` - Explicit NFS/Lustre shared workspace path ### Multi-Node SLURM @@ -454,6 +613,8 @@ Automatically applies (see presets under `src/madengine/deployment/presets/k8s/` } ``` +See [`examples/configs/templates/slurm.yaml`](../examples/configs/templates/slurm.yaml) for the complete annotated YAML template, or [`examples/configs/demo/slurm/`](../examples/configs/demo/slurm/) for ready-to-run examples. + ## Distributed Training ### Launcher Configuration @@ -469,6 +630,8 @@ Automatically applies (see presets under `src/madengine/deployment/presets/k8s/` } ``` +> **YAML config note**: When using `--config` and setting `distributed.launcher` directly (in a user YAML file or as an inline override), you must also set `distributed.enabled: true` explicitly. The default config loads the `launcher: none` group, which sets `distributed.enabled: false`; setting `distributed.launcher` alone does not override it. Selecting a launcher config group instead (e.g., `--config launcher=torchrun`) sets both values for you. 
+ **Launcher Options:** - `launcher` - Framework name (required) - `nnodes` - Number of nodes diff --git a/docs/superpowers/plans/2026-05-02-config-driven-yaml.md b/docs/superpowers/plans/2026-05-02-config-driven-yaml.md new file mode 100644 index 00000000..01455ba4 --- /dev/null +++ b/docs/superpowers/plans/2026-05-02-config-driven-yaml.md @@ -0,0 +1,1989 @@ +# Config-Driven YAML System Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a `--config` CLI argument to madengine that loads Hydra-based YAML configs with composable config groups and CLI override support, backward-compatible with `--additional-context`. + +**Architecture:** A new `src/madengine/config/` package uses Hydra's Compose API (not `@hydra.main`) to load YAML config groups from `src/madengine/configs/`, then a `ConfigTranslator` maps clean YAML keys to the internal `additional_context` dict format that existing orchestrators expect. The `--config` arg is added to the `run` and `build` Typer commands; `--additional-context` still works and takes highest priority. 
+ +**Tech Stack:** hydra-core>=1.3, omegaconf>=2.3 (new deps); Typer (existing CLI); pytest (tests) + +--- + +## File Map + +### New Files — Config Package + +| File | Responsibility | +|------|---------------| +| `src/madengine/config/__init__.py` | Public API: `load_config(config_args) -> (dict, dict)` | +| `src/madengine/config/loader.py` | `HydraConfigLoader` — Compose API wrapper, separates file path from overrides | +| `src/madengine/config/translator.py` | `ConfigTranslator` — maps YAML keys to `additional_context` format | +| `src/madengine/config/schema.py` | `ConfigValidator` — cross-field checks, unknown key detection | + +### New Files — YAML Configs + +| Directory | Files | +|-----------|-------| +| `src/madengine/configs/` | `config.yaml` (root) | +| `src/madengine/configs/platform/` | `docker.yaml`, `bare_metal.yaml`, `singularity.yaml`, `podman.yaml` | +| `src/madengine/configs/scheduler/` | `local.yaml`, `slurm.yaml`, `k8s.yaml` | +| `src/madengine/configs/hardware/` | `amd.yaml`, `nvidia.yaml`, `cpu.yaml` | +| `src/madengine/configs/launcher/` | `none.yaml`, `torchrun.yaml`, `deepspeed.yaml`, `megatron.yaml`, `vllm.yaml`, `sglang.yaml`, `sglang_disagg.yaml`, `torchtitan.yaml`, `primus.yaml`, `native.yaml` | +| `src/madengine/configs/profile/` | `mi300x_8gpu.yaml`, `mi300x_single.yaml`, `mi250x_4gpu.yaml`, `h100_8gpu.yaml`, `a100_8gpu.yaml` | +| `src/madengine/configs/env/` | `nccl_debug.yaml`, `nccl_tuned.yaml`, `infiniband.yaml`, `miopen_defaults.yaml` | +| `src/madengine/configs/tools/` | `rocprofv3_lightweight.yaml`, `rocprofv3_comprehensive.yaml`, `power_profiler.yaml`, `vram_profiler.yaml`, `rocm_trace_lite.yaml` | +| `src/madengine/configs/data/` | `local.yaml`, `s3.yaml`, `minio.yaml`, `nas.yaml` | +| `src/madengine/configs/build/` | `default.yaml`, `ci.yaml`, `multi_arch.yaml` | + +### New Files — Tests + +| File | Responsibility | +|------|---------------| +| `tests/unit/test_hydra_config_loader.py` | HydraConfigLoader unit tests | +| 
`tests/unit/test_config_translator.py` | ConfigTranslator unit tests | +| `tests/unit/test_config_schema.py` | ConfigValidator unit tests | +| `tests/unit/test_config_integration.py` | End-to-end: `load_config()` → dict | +| `tests/fixtures/configs/` | Test YAML fixtures | + +### Modified Files + +| File | Change | +|------|--------| +| `pyproject.toml` | Add `hydra-core>=1.3`, `omegaconf>=2.3` to dependencies; add `configs` to wheel force-include | +| `src/madengine/cli/commands/run.py` | Add `--config` parameter, config loading + merge logic | +| `src/madengine/cli/commands/build.py` | Add `--config` parameter, config loading + merge logic | + +--- + +### Task 1: Add Dependencies and Wheel Config + +**Files:** +- Modify: `pyproject.toml` + +- [ ] **Step 1: Add hydra-core and omegaconf to dependencies** + +In `pyproject.toml`, add to the `dependencies` list after `"pyyaml>=6.0"`: + +```toml +dependencies = [ + "pandas", + "GitPython", + "jsondiff", + "sqlalchemy", + "paramiko", + "tqdm", + "typing-extensions", + "pymongo", + "toml", + "typer>=0.9.0", + "rich>=13.0.0", + "click>=8.0.0", + "jinja2>=3.0.0", + "pyyaml>=6.0", + "hydra-core>=1.3", + "omegaconf>=2.3", +] +``` + +- [ ] **Step 2: Add configs directory to wheel force-include** + +In the `[tool.hatch.build.targets.wheel.force-include]` section, add: + +```toml +[tool.hatch.build.targets.wheel.force-include] +"src/madengine/scripts" = "madengine/scripts" +"src/madengine/deployment/templates" = "madengine/deployment/templates" +"src/madengine/configs" = "madengine/configs" +``` + +- [ ] **Step 3: Install updated dependencies** + +Run: `pip install -e ".[dev]"` +Expected: Clean install with hydra-core and omegaconf resolved. + +- [ ] **Step 4: Verify imports work** + +Run: `python -c "from hydra import compose, initialize_config_dir; from omegaconf import OmegaConf, DictConfig; print('OK')"` +Expected: Prints `OK`. 
+ +- [ ] **Step 5: Commit** + +```bash +git add pyproject.toml +git commit -m "feat(config): add hydra-core and omegaconf dependencies" +``` + +--- + +### Task 2: Create YAML Config Files — Root and Default Groups + +**Files:** +- Create: `src/madengine/configs/config.yaml` +- Create: `src/madengine/configs/platform/docker.yaml` +- Create: `src/madengine/configs/platform/bare_metal.yaml` +- Create: `src/madengine/configs/platform/singularity.yaml` +- Create: `src/madengine/configs/platform/podman.yaml` +- Create: `src/madengine/configs/scheduler/local.yaml` +- Create: `src/madengine/configs/scheduler/slurm.yaml` +- Create: `src/madengine/configs/scheduler/k8s.yaml` +- Create: `src/madengine/configs/hardware/amd.yaml` +- Create: `src/madengine/configs/hardware/nvidia.yaml` +- Create: `src/madengine/configs/hardware/cpu.yaml` +- Create: `src/madengine/configs/launcher/none.yaml` +- Create: `src/madengine/configs/launcher/torchrun.yaml` +- Create: `src/madengine/configs/launcher/deepspeed.yaml` +- Create: `src/madengine/configs/launcher/megatron.yaml` +- Create: `src/madengine/configs/launcher/vllm.yaml` +- Create: `src/madengine/configs/launcher/sglang.yaml` +- Create: `src/madengine/configs/launcher/sglang_disagg.yaml` +- Create: `src/madengine/configs/launcher/torchtitan.yaml` +- Create: `src/madengine/configs/launcher/primus.yaml` +- Create: `src/madengine/configs/launcher/native.yaml` + +- [ ] **Step 1: Create directory structure** + +```bash +mkdir -p src/madengine/configs/{platform,scheduler,hardware,launcher,profile,env,tools,data,build} +``` + +- [ ] **Step 2: Create root config.yaml** + +Write to `src/madengine/configs/config.yaml`: + +```yaml +defaults: + - platform: docker + - scheduler: local + - hardware: amd + - launcher: none + - _self_ + +model: + tags: [] + manifest_file: null + container_image: null + skip_run: false + timeout: null + +docker: + build_args: {} + env_vars: {} + mounts: {} + gpus: null + cpus: null + additional_run_options: null + 
keep_alive: false + clean_cache: false + +build: + registry: null + target_archs: [] + manifest_output: build_manifest.json + +env_vars: {} + +debug: false +live_output: false + +log_error: + pattern_scan: true + benign_patterns: [] + patterns: [] + +tools: [] +pre_scripts: [] +post_scripts: [] +encapsulate_script: null + +data_config: data.json + +output: perf.csv +summary_output: null +``` + +- [ ] **Step 3: Create platform configs** + +Write to `src/madengine/configs/platform/docker.yaml`: + +```yaml +# @package _global_ +platform: + type: docker +``` + +Write to `src/madengine/configs/platform/bare_metal.yaml`: + +```yaml +# @package _global_ +platform: + type: bare_metal +``` + +Write to `src/madengine/configs/platform/singularity.yaml`: + +```yaml +# @package _global_ +platform: + type: singularity +``` + +Write to `src/madengine/configs/platform/podman.yaml`: + +```yaml +# @package _global_ +platform: + type: podman +``` + +- [ ] **Step 4: Create scheduler configs** + +Write to `src/madengine/configs/scheduler/local.yaml`: + +```yaml +# @package _global_ +``` + +Write to `src/madengine/configs/scheduler/slurm.yaml`: + +```yaml +# @package _global_ +slurm: + partition: amd-rccl + nodes: 1 + gpus_per_node: 8 + time: "24:00:00" + output_dir: ./slurm_results + exclusive: true + modules: [] + account: null + qos: null + constraint: null + nodelist: null + exclude: null + results_dir: null + shared_workspace: null + network_interface: null + +env_vars: + OMP_NUM_THREADS: "8" + MIOPEN_FIND_MODE: "1" +``` + +Write to `src/madengine/configs/scheduler/k8s.yaml`: + +```yaml +# @package _global_ +k8s: + kubeconfig: ~/.kube/config + namespace: default + image_pull_policy: Always + backoff_limit: 3 + ttl_seconds_after_finished: null + allow_privileged_profiling: null + gpu_count: null + gpu_resource_name: amd.com/gpu + memory: null + memory_limit: null + cpu: null + cpu_limit: null + host_ipc: true + node_selector: {} + tolerations: [] + nfs_storage_class: nfs-banff + 
local_path_storage_class: local-path + data_storage_class: nfs-banff + recreate_shared_data_pvc: false + results_pvc: null + data_pvc: null + output_dir: null + secrets: + strategy: from_local_credentials + image_pull_secret_names: [] + runtime_secret_name: null + +env_vars: + OMP_NUM_THREADS: "8" +``` + +- [ ] **Step 5: Create hardware configs** + +Write to `src/madengine/configs/hardware/amd.yaml`: + +```yaml +# @package _global_ +gpu_vendor: AMD +guest_os: UBUNTU + +runtime: + devices: + - /dev/kfd + - /dev/dri + - /dev/infiniband + capabilities: + - SYS_PTRACE + security_opts: + - seccomp=unconfined + network_mode: host + ipc: host + groups: + - video + use_gpu_flag: false +``` + +Write to `src/madengine/configs/hardware/nvidia.yaml`: + +```yaml +# @package _global_ +gpu_vendor: NVIDIA +guest_os: UBUNTU + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: host + ipc: host + groups: [] + use_gpu_flag: true +``` + +Write to `src/madengine/configs/hardware/cpu.yaml`: + +```yaml +# @package _global_ +gpu_vendor: null +guest_os: UBUNTU + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: null + ipc: null + groups: [] + use_gpu_flag: false +``` + +- [ ] **Step 6: Create launcher configs** + +Write to `src/madengine/configs/launcher/none.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: false +``` + +Write to `src/madengine/configs/launcher/torchrun.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: torchrun + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 + port: 29500 +``` + +Write to `src/madengine/configs/launcher/deepspeed.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: deepspeed + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 +``` + +Write to `src/madengine/configs/launcher/megatron.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: torchrun + backend: nccl + 
nnodes: 1 + nproc_per_node: 8 + master_port: 29500 +``` + +Write to `src/madengine/configs/launcher/vllm.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: vllm + nnodes: 1 + nproc_per_node: 4 + +vllm: + kv_cache_size: 0.7 + max_model_len: null + tensor_parallel_size: null +``` + +Write to `src/madengine/configs/launcher/sglang.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: sglang + backend: nccl + nnodes: 1 + nproc_per_node: 8 + port: 29500 +``` + +Write to `src/madengine/configs/launcher/sglang_disagg.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: sglang-disagg + backend: nccl + nnodes: 3 + nproc_per_node: 8 + port: 29500 + +sglang_disagg: + prefill_nodes: null + decode_nodes: null + transfer_backend: mooncake +``` + +Write to `src/madengine/configs/launcher/torchtitan.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: torchrun + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 +``` + +Write to `src/madengine/configs/launcher/primus.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: primus + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 +``` + +Write to `src/madengine/configs/launcher/native.yaml`: + +```yaml +# @package _global_ +distributed: + enabled: true + launcher: native + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 +``` + +- [ ] **Step 7: Verify Hydra can compose the root config** + +Run: `python -c " +from hydra import compose, initialize_config_dir +from hydra.core.global_hydra import GlobalHydra +from omegaconf import OmegaConf +import os +GlobalHydra.instance().clear() +config_dir = os.path.abspath('src/madengine/configs') +with initialize_config_dir(config_dir=config_dir, version_base=None): + cfg = compose(config_name='config') +print(OmegaConf.to_yaml(cfg)) +"` + +Expected: Prints the full composed YAML with all default groups 
merged — `gpu_vendor: AMD`, `distributed.enabled: false`, etc. + +- [ ] **Step 8: Commit** + +```bash +git add src/madengine/configs/ +git commit -m "feat(config): add root config.yaml and default config groups" +``` + +--- + +### Task 3: Create YAML Config Files — Append-Only Groups + +**Files:** +- Create: `src/madengine/configs/profile/mi300x_8gpu.yaml` (and 4 others) +- Create: `src/madengine/configs/env/nccl_debug.yaml` (and 3 others) +- Create: `src/madengine/configs/tools/rocprofv3_lightweight.yaml` (and 4 others) +- Create: `src/madengine/configs/data/local.yaml` (and 3 others) +- Create: `src/madengine/configs/build/default.yaml` (and 2 others) + +- [ ] **Step 1: Create profile configs** + +Write to `src/madengine/configs/profile/mi300x_8gpu.yaml`: + +```yaml +# @package _global_ +gpu_type: mi300x +gpu_memory_gb: 192 +gpus_per_node: 8 + +distributed: + nproc_per_node: 8 + +env_vars: + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" + HSA_FORCE_FINE_GRAIN_PCIE: "1" +``` + +Write to `src/madengine/configs/profile/mi300x_single.yaml`: + +```yaml +# @package _global_ +gpu_type: mi300x +gpu_memory_gb: 192 +gpus_per_node: 1 + +distributed: + nproc_per_node: 1 +``` + +Write to `src/madengine/configs/profile/mi250x_4gpu.yaml`: + +```yaml +# @package _global_ +gpu_type: mi250x +gpu_memory_gb: 128 +gpus_per_node: 4 + +distributed: + nproc_per_node: 4 + +env_vars: + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" +``` + +Write to `src/madengine/configs/profile/h100_8gpu.yaml`: + +```yaml +# @package _global_ +gpu_vendor: NVIDIA +guest_os: UBUNTU +gpu_type: h100 +gpu_memory_gb: 80 +gpus_per_node: 8 + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: host + ipc: host + groups: [] + use_gpu_flag: true + +distributed: + nproc_per_node: 8 +``` + +Write to `src/madengine/configs/profile/a100_8gpu.yaml`: + +```yaml +# @package _global_ +gpu_vendor: NVIDIA +guest_os: UBUNTU +gpu_type: a100 +gpu_memory_gb: 80 +gpus_per_node: 8 + +runtime: + devices: 
[] + capabilities: [] + security_opts: [] + network_mode: host + ipc: host + groups: [] + use_gpu_flag: true + +distributed: + nproc_per_node: 8 +``` + +- [ ] **Step 2: Create env configs** + +Write to `src/madengine/configs/env/nccl_debug.yaml`: + +```yaml +# @package _global_ +env_vars: + NCCL_DEBUG: INFO + NCCL_DEBUG_SUBSYS: "INIT,NET,GRAPH" + TORCH_DISTRIBUTED_DEBUG: DETAIL +``` + +Write to `src/madengine/configs/env/nccl_tuned.yaml`: + +```yaml +# @package _global_ +env_vars: + NCCL_DEBUG: WARN + TORCH_NCCL_HIGH_PRIORITY: "1" + GPU_MAX_HW_QUEUES: "2" + NCCL_TIMEOUT: "600" + TORCH_NCCL_ASYNC_ERROR_HANDLING: "1" +``` + +Write to `src/madengine/configs/env/infiniband.yaml`: + +```yaml +# @package _global_ +env_vars: + NCCL_IB_DISABLE: "0" + NCCL_IB_HCA: "mlx5_0:1,mlx5_1:1" + NCCL_SOCKET_IFNAME: ib0 + NCCL_NET_GDR_LEVEL: "3" +``` + +Write to `src/madengine/configs/env/miopen_defaults.yaml`: + +```yaml +# @package _global_ +env_vars: + MIOPEN_FIND_MODE: "1" + MIOPEN_USER_DB_PATH: /tmp/.miopen +``` + +- [ ] **Step 3: Create tools configs** + +Write to `src/madengine/configs/tools/rocprofv3_lightweight.yaml`: + +```yaml +# @package _global_ +tools: + - name: rocprofv3_lightweight +``` + +Write to `src/madengine/configs/tools/rocprofv3_comprehensive.yaml`: + +```yaml +# @package _global_ +tools: + - name: rocprofv3_full + env_vars: + RCCL_DEBUG: INFO + HSA_ENABLE_SDMA: "0" + - name: gpu_info_power_profiler + env_vars: + POWER_DEVICE: all + POWER_SAMPLING_RATE: "0.1" + POWER_DUAL_GCD: "false" + - name: gpu_info_vram_profiler + env_vars: + VRAM_DEVICE: all + VRAM_SAMPLING_RATE: "0.1" + - name: miopen_trace + - name: rocblas_trace +``` + +Write to `src/madengine/configs/tools/power_profiler.yaml`: + +```yaml +# @package _global_ +tools: + - name: gpu_info_power_profiler + env_vars: + POWER_DEVICE: all + POWER_SAMPLING_RATE: "0.1" + POWER_MODE: power + POWER_DUAL_GCD: "false" + POWER_OUTPUT_FILE: gpu_info_power_profiler_output.csv +``` + +Write to 
`src/madengine/configs/tools/vram_profiler.yaml`: + +```yaml +# @package _global_ +tools: + - name: gpu_info_vram_profiler + env_vars: + VRAM_DEVICE: all + VRAM_SAMPLING_RATE: "0.1" + VRAM_MODE: vram + VRAM_DUAL_GCD: "false" + VRAM_OUTPUT_FILE: gpu_info_vram_profiler_output.csv +``` + +Write to `src/madengine/configs/tools/rocm_trace_lite.yaml`: + +```yaml +# @package _global_ +tools: + - name: rocm_trace_lite + env_vars: + RTL_MODE: lite +``` + +- [ ] **Step 4: Create data configs** + +Write to `src/madengine/configs/data/local.yaml`: + +```yaml +# @package _global_ +data: + provider: local + path: null +``` + +Write to `src/madengine/configs/data/s3.yaml`: + +```yaml +# @package _global_ +data: + provider: s3 + bucket: null + prefix: null + region: null +``` + +Write to `src/madengine/configs/data/minio.yaml`: + +```yaml +# @package _global_ +data: + provider: minio + endpoint: null + bucket: null + access_key: null + secret_key: null +``` + +Write to `src/madengine/configs/data/nas.yaml`: + +```yaml +# @package _global_ +data: + provider: nas + mount_path: null +``` + +- [ ] **Step 5: Create build configs** + +Write to `src/madengine/configs/build/default.yaml`: + +```yaml +# @package _global_ +build: + registry: null + target_archs: [] + manifest_output: build_manifest.json +``` + +Write to `src/madengine/configs/build/ci.yaml`: + +```yaml +# @package _global_ +docker: + clean_cache: true + +build: + registry: null + target_archs: [] + manifest_output: build_manifest.json +``` + +Write to `src/madengine/configs/build/multi_arch.yaml`: + +```yaml +# @package _global_ +build: + registry: null + target_archs: + - gfx942 + - gfx90a + - gfx908 + manifest_output: build_manifest.json +``` + +- [ ] **Step 6: Verify append-only group composition** + +Run: `python -c " +from hydra import compose, initialize_config_dir +from hydra.core.global_hydra import GlobalHydra +from omegaconf import OmegaConf +import os +GlobalHydra.instance().clear() +config_dir = 
os.path.abspath('src/madengine/configs') +with initialize_config_dir(config_dir=config_dir, version_base=None): + cfg = compose(config_name='config', overrides=['scheduler=slurm', 'launcher=torchrun', '+profile=mi300x_8gpu', '+env=nccl_debug']) +print(OmegaConf.to_yaml(cfg)) +"` + +Expected: Prints composed config with SLURM scheduler, torchrun launcher, mi300x profile, and NCCL debug env vars all merged. + +- [ ] **Step 7: Commit** + +```bash +git add src/madengine/configs/ +git commit -m "feat(config): add append-only config groups (profile, env, tools, data, build)" +``` + +--- + +### Task 4: Implement HydraConfigLoader + +**Files:** +- Create: `src/madengine/config/__init__.py` +- Create: `src/madengine/config/loader.py` +- Test: `tests/unit/test_hydra_config_loader.py` + +- [ ] **Step 1: Write failing tests for HydraConfigLoader** + +Write to `tests/unit/test_hydra_config_loader.py`: + +```python +#!/usr/bin/env python3 +"""Tests for HydraConfigLoader.""" + +import os +import pytest +import tempfile +from pathlib import Path + +from omegaconf import DictConfig + +from madengine.config.loader import HydraConfigLoader +from madengine.core.errors import ConfigurationError + + +class TestParseArgs: + def test_hydra_overrides_only(self): + user_file, overrides = HydraConfigLoader._parse_args( + ["scheduler=slurm", "distributed.nnodes=4"] + ) + assert user_file is None + assert overrides == ["scheduler=slurm", "distributed.nnodes=4"] + + def test_yaml_file_only(self): + user_file, overrides = HydraConfigLoader._parse_args( + ["/path/to/config.yaml"] + ) + assert user_file == "/path/to/config.yaml" + assert overrides == [] + + def test_yaml_file_with_overrides(self): + user_file, overrides = HydraConfigLoader._parse_args( + ["/path/to/config.yaml", "distributed.nnodes=8"] + ) + assert user_file == "/path/to/config.yaml" + assert overrides == ["distributed.nnodes=8"] + + def test_yml_extension_recognized(self): + user_file, overrides = HydraConfigLoader._parse_args( + 
["/path/to/config.yml"] + ) + assert user_file == "/path/to/config.yml" + + def test_multiple_yaml_files_raises(self): + with pytest.raises(ConfigurationError, match="Only one YAML"): + HydraConfigLoader._parse_args( + ["/path/a.yaml", "/path/b.yaml"] + ) + + def test_append_override_not_treated_as_file(self): + user_file, overrides = HydraConfigLoader._parse_args( + ["+profile=mi300x_8gpu"] + ) + assert user_file is None + assert overrides == ["+profile=mi300x_8gpu"] + + def test_empty_args(self): + user_file, overrides = HydraConfigLoader._parse_args([]) + assert user_file is None + assert overrides == [] + + +class TestLoad: + def test_defaults_only(self): + cfg = HydraConfigLoader.load([]) + assert isinstance(cfg, DictConfig) + assert cfg.gpu_vendor == "AMD" + assert cfg.guest_os == "UBUNTU" + assert cfg.distributed.enabled is False + + def test_scheduler_override(self): + cfg = HydraConfigLoader.load(["scheduler=slurm"]) + assert "slurm" in cfg + assert cfg.slurm.partition == "amd-rccl" + + def test_launcher_override(self): + cfg = HydraConfigLoader.load(["launcher=torchrun"]) + assert cfg.distributed.enabled is True + assert cfg.distributed.launcher == "torchrun" + + def test_inline_value_override(self): + cfg = HydraConfigLoader.load( + ["launcher=torchrun", "distributed.nnodes=4"] + ) + assert cfg.distributed.nnodes == 4 + + def test_append_profile(self): + cfg = HydraConfigLoader.load(["+profile=mi300x_8gpu"]) + assert cfg.gpu_type == "mi300x" + assert cfg.distributed.nproc_per_node == 8 + + def test_user_yaml_file(self): + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False + ) as f: + f.write("debug: true\nenv_vars:\n MY_VAR: hello\n") + f.flush() + try: + cfg = HydraConfigLoader.load([f.name]) + assert cfg.debug is True + assert cfg.env_vars.MY_VAR == "hello" + finally: + os.unlink(f.name) + + def test_user_yaml_with_overrides(self): + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False + ) as f: + 
f.write("debug: true\n") + f.flush() + try: + cfg = HydraConfigLoader.load( + [f.name, "scheduler=slurm"] + ) + assert cfg.debug is True + assert "slurm" in cfg + finally: + os.unlink(f.name) + + def test_hardware_nvidia(self): + cfg = HydraConfigLoader.load(["hardware=nvidia"]) + assert cfg.gpu_vendor == "NVIDIA" + assert cfg.runtime.use_gpu_flag is True +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/unit/test_hydra_config_loader.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'madengine.config'` + +- [ ] **Step 3: Implement HydraConfigLoader** + +Write to `src/madengine/config/__init__.py`: + +```python +"""Config-driven YAML configuration system for madengine.""" + +from madengine.config.loader import HydraConfigLoader +from madengine.config.translator import ConfigTranslator +from madengine.config.schema import ConfigValidator + + +def load_config(config_args: list) -> tuple: + """Load config from Hydra overrides and/or user YAML file. + + Args: + config_args: List of Hydra overrides and/or a YAML file path. + + Returns: + Tuple of (additional_context dict, metadata dict). 
+ """ + cfg = HydraConfigLoader.load(config_args) + errors = ConfigValidator.validate(cfg) + if errors: + from madengine.core.errors import ConfigurationError + + raise ConfigurationError( + "Config validation errors:\n" + "\n".join(f" - {e}" for e in errors) + ) + return ConfigTranslator.to_additional_context(cfg) +``` + +Write to `src/madengine/config/loader.py`: + +```python +"""Hydra-based config loader using the Compose API.""" + +import importlib.resources +import os +from pathlib import Path + +from hydra import compose, initialize_config_dir +from hydra.core.global_hydra import GlobalHydra +from omegaconf import DictConfig, OmegaConf + +from madengine.core.errors import ConfigurationError + + +class HydraConfigLoader: + """Loads madengine config using Hydra's Compose API.""" + + @staticmethod + def load(config_args: list) -> DictConfig: + """Load and compose config from Hydra overrides and/or user YAML. + + Args: + config_args: Mix of Hydra overrides and optional user YAML path. + + Returns: + Composed DictConfig with all merges applied. 
+ """ + user_file, overrides = HydraConfigLoader._parse_args(config_args) + + config_dir = str( + Path(importlib.resources.files("madengine")) / "configs" + ) + + if not os.path.isdir(config_dir): + config_dir = str( + Path(__file__).parent.parent / "configs" + ) + + GlobalHydra.instance().clear() + + with initialize_config_dir(config_dir=config_dir, version_base=None): + cfg = compose(config_name="config", overrides=overrides) + + if user_file: + user_cfg = OmegaConf.load(user_file) + OmegaConf.set_struct(cfg, False) + cfg = OmegaConf.merge(cfg, user_cfg) + + return cfg + + @staticmethod + def _parse_args(config_args: list) -> tuple: + """Separate user YAML file path from Hydra overrides.""" + user_file = None + overrides = [] + for arg in config_args: + if ( + arg.endswith((".yaml", ".yml")) + and "=" not in arg + and not arg.startswith("+") + ): + if user_file: + raise ConfigurationError( + "Only one YAML config file allowed" + ) + user_file = arg + else: + overrides.append(arg) + return user_file, overrides +``` + +- [ ] **Step 4: Create stub translator and schema so imports resolve** + +Write to `src/madengine/config/translator.py`: + +```python +"""Translates clean YAML config to internal additional_context format.""" + +from omegaconf import DictConfig, OmegaConf + + +class ConfigTranslator: + """Maps YAML config keys to internal additional_context dict format.""" + + KEY_MAP = { + "docker.build_args": "docker_build_arg", + "docker.env_vars": "docker_env_vars", + "docker.mounts": "docker_mounts", + "docker.gpus": "docker_gpus", + "docker.cpus": "docker_cpus", + "docker.additional_run_options": "additional_docker_run_options", + "log_error.pattern_scan": "log_error_pattern_scan", + "log_error.benign_patterns": "log_error_benign_patterns", + "log_error.patterns": "log_error_patterns", + } + + EXTRACTED_KEYS = { + "model", "build", "platform", "output", + "summary_output", "data_config", "live_output", + } + + @classmethod + def to_additional_context(cls, cfg: 
DictConfig) -> tuple: + """Placeholder — implemented in Task 5.""" + return {}, {} +``` + +Write to `src/madengine/config/schema.py`: + +```python +"""Config validation.""" + +from omegaconf import DictConfig + + +class ConfigValidator: + """Validates composed config for consistency.""" + + @staticmethod + def validate(cfg: DictConfig) -> list: + """Placeholder — implemented in Task 6.""" + return [] +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `pytest tests/unit/test_hydra_config_loader.py -v` +Expected: All tests PASS. + +- [ ] **Step 6: Commit** + +```bash +git add src/madengine/config/ tests/unit/test_hydra_config_loader.py +git commit -m "feat(config): implement HydraConfigLoader with Compose API" +``` + +--- + +### Task 5: Implement ConfigTranslator + +**Files:** +- Modify: `src/madengine/config/translator.py` +- Test: `tests/unit/test_config_translator.py` + +- [ ] **Step 1: Write failing tests for ConfigTranslator** + +Write to `tests/unit/test_config_translator.py`: + +```python +#!/usr/bin/env python3 +"""Tests for ConfigTranslator.""" + +import pytest +from omegaconf import OmegaConf + +from madengine.config.translator import ConfigTranslator + + +def make_cfg(overrides: dict) -> "DictConfig": + """Build a DictConfig from a base + overrides for testing.""" + base = { + "model": {"tags": [], "manifest_file": None, "container_image": None, "skip_run": False, "timeout": None}, + "docker": {"build_args": {}, "env_vars": {}, "mounts": {}, "gpus": None, "cpus": None, "additional_run_options": None, "keep_alive": False, "clean_cache": False}, + "build": {"registry": None, "target_archs": [], "manifest_output": "build_manifest.json"}, + "env_vars": {}, + "debug": False, + "live_output": False, + "log_error": {"pattern_scan": True, "benign_patterns": [], "patterns": []}, + "tools": [], + "pre_scripts": [], + "post_scripts": [], + "encapsulate_script": None, + "data_config": "data.json", + "output": "perf.csv", + "summary_output": None, + 
"gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "runtime": {"devices": [], "capabilities": [], "security_opts": [], "network_mode": "host", "ipc": "host", "groups": [], "use_gpu_flag": False}, + "platform": {"type": "docker"}, + } + merged = {**base, **overrides} + return OmegaConf.create(merged) + + +class TestDockerKeyMapping: + def test_build_args_mapped(self): + cfg = make_cfg({"docker": {"build_args": {"KEY": "val"}, "env_vars": {}, "mounts": {}, "gpus": None, "cpus": None, "additional_run_options": None, "keep_alive": False, "clean_cache": False}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["docker_build_arg"] == {"KEY": "val"} + + def test_env_vars_mapped(self): + cfg = make_cfg({"docker": {"build_args": {}, "env_vars": {"A": "1"}, "mounts": {}, "gpus": None, "cpus": None, "additional_run_options": None, "keep_alive": False, "clean_cache": False}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["docker_env_vars"] == {"A": "1"} + + def test_null_gpus_excluded(self): + cfg = make_cfg({}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "docker_gpus" not in ctx + + def test_non_null_gpus_included(self): + cfg = make_cfg({"docker": {"build_args": {}, "env_vars": {}, "mounts": {}, "gpus": "0-3", "cpus": None, "additional_run_options": None, "keep_alive": False, "clean_cache": False}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["docker_gpus"] == "0-3" + + +class TestLogErrorMapping: + def test_pattern_scan_mapped(self): + cfg = make_cfg({"log_error": {"pattern_scan": False, "benign_patterns": [], "patterns": []}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["log_error_pattern_scan"] is False + + def test_patterns_mapped(self): + cfg = make_cfg({"log_error": {"pattern_scan": True, "benign_patterns": ["OK"], "patterns": ["ERR"]}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["log_error_benign_patterns"] == ["OK"] + 
assert ctx["log_error_patterns"] == ["ERR"] + + +class TestPassthroughKeys: + def test_gpu_vendor_passthrough(self): + cfg = make_cfg({"gpu_vendor": "NVIDIA"}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["gpu_vendor"] == "NVIDIA" + + def test_env_vars_passthrough(self): + cfg = make_cfg({"env_vars": {"MY": "VAR"}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["env_vars"] == {"MY": "VAR"} + + def test_slurm_passthrough(self): + cfg = make_cfg({"slurm": {"partition": "gpu"}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["slurm"] == {"partition": "gpu"} + + def test_distributed_passthrough(self): + cfg = make_cfg({"distributed": {"enabled": True, "launcher": "torchrun"}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["distributed"]["launcher"] == "torchrun" + + def test_tools_passthrough(self): + cfg = make_cfg({"tools": [{"name": "rpd"}]}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["tools"] == [{"name": "rpd"}] + + +class TestExtractedKeys: + def test_model_extracted(self): + cfg = make_cfg({"model": {"tags": ["dummy"], "manifest_file": None, "container_image": None, "skip_run": False, "timeout": 300}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "model" not in ctx + assert meta["model"]["tags"] == ["dummy"] + assert meta["model"]["timeout"] == 300 + + def test_build_extracted(self): + cfg = make_cfg({"build": {"registry": "myregistry.io", "target_archs": ["gfx942"], "manifest_output": "build_manifest.json"}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "build" not in ctx + assert meta["build"]["registry"] == "myregistry.io" + + def test_platform_extracted(self): + cfg = make_cfg({}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "platform" not in ctx + assert meta["platform"]["type"] == "docker" + + def test_container_image_promoted(self): + cfg = make_cfg({"model": 
{"tags": [], "manifest_file": None, "container_image": "myimage:latest", "skip_run": False, "timeout": None}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["MAD_CONTAINER_IMAGE"] == "myimage:latest" + + def test_runtime_extracted(self): + cfg = make_cfg({}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "runtime" not in ctx + assert "runtime" in meta +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/unit/test_config_translator.py -v` +Expected: FAIL — translator returns empty dicts. + +- [ ] **Step 3: Implement ConfigTranslator** + +Replace the content of `src/madengine/config/translator.py` with: + +```python +"""Translates clean YAML config to internal additional_context format.""" + +from omegaconf import DictConfig, OmegaConf + + +class ConfigTranslator: + """Maps YAML config keys to internal additional_context dict format.""" + + KEY_MAP = { + "docker.build_args": "docker_build_arg", + "docker.env_vars": "docker_env_vars", + "docker.mounts": "docker_mounts", + "docker.gpus": "docker_gpus", + "docker.cpus": "docker_cpus", + "docker.additional_run_options": "additional_docker_run_options", + "log_error.pattern_scan": "log_error_pattern_scan", + "log_error.benign_patterns": "log_error_benign_patterns", + "log_error.patterns": "log_error_patterns", + } + + EXTRACTED_KEYS = { + "model", "build", "platform", "output", + "summary_output", "data_config", "live_output", + } + + @classmethod + def to_additional_context(cls, cfg: DictConfig) -> tuple: + """Convert DictConfig to (additional_context, metadata) tuple. + + Returns: + additional_context: dict in the format expected by existing pipeline. + metadata: dict with model.tags, build.registry, etc. for the CLI layer. 
+ """ + raw = OmegaConf.to_container(cfg, resolve=True) + + context = {} + metadata = {} + + for key, value in raw.items(): + if key in cls.EXTRACTED_KEYS: + metadata[key] = value + elif key == "docker": + for subkey, subval in value.items(): + internal_key = cls.KEY_MAP.get( + f"docker.{subkey}", f"docker_{subkey}" + ) + if subval is not None: + context[internal_key] = subval + elif key == "log_error": + for subkey, subval in value.items(): + internal_key = cls.KEY_MAP.get( + f"log_error.{subkey}", f"log_error_{subkey}" + ) + context[internal_key] = subval + elif key == "runtime": + metadata["runtime"] = value + else: + if value is not None: + context[key] = value + + model = metadata.get("model", {}) + if model and model.get("container_image"): + context["MAD_CONTAINER_IMAGE"] = model["container_image"] + + return context, metadata +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pytest tests/unit/test_config_translator.py -v` +Expected: All tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/madengine/config/translator.py tests/unit/test_config_translator.py +git commit -m "feat(config): implement ConfigTranslator key mapping" +``` + +--- + +### Task 6: Implement ConfigValidator + +**Files:** +- Modify: `src/madengine/config/schema.py` +- Test: `tests/unit/test_config_schema.py` + +- [ ] **Step 1: Write failing tests for ConfigValidator** + +Write to `tests/unit/test_config_schema.py`: + +```python +#!/usr/bin/env python3 +"""Tests for ConfigValidator.""" + +import pytest +from omegaconf import OmegaConf + +from madengine.config.schema import ConfigValidator + + +def make_cfg(data: dict) -> "DictConfig": + return OmegaConf.create(data) + + +class TestConflictDetection: + def test_slurm_and_k8s_conflict(self): + cfg = make_cfg({"slurm": {"partition": "gpu"}, "k8s": {"namespace": "default"}}) + errors = ConfigValidator.validate(cfg) + assert any("Cannot specify both" in e for e in errors) + + def test_slurm_only_no_conflict(self): + cfg 
= make_cfg({"slurm": {"partition": "gpu"}}) + errors = ConfigValidator.validate(cfg) + assert not any("Cannot specify both" in e for e in errors) + + def test_k8s_only_no_conflict(self): + cfg = make_cfg({"k8s": {"namespace": "default"}}) + errors = ConfigValidator.validate(cfg) + assert not any("Cannot specify both" in e for e in errors) + + +class TestDistributedValidation: + def test_enabled_without_launcher(self): + cfg = make_cfg({"distributed": {"enabled": True}}) + errors = ConfigValidator.validate(cfg) + assert any("requires distributed.launcher" in e for e in errors) + + def test_enabled_with_launcher(self): + cfg = make_cfg({"distributed": {"enabled": True, "launcher": "torchrun"}}) + errors = ConfigValidator.validate(cfg) + assert not any("requires distributed.launcher" in e for e in errors) + + def test_invalid_nnodes(self): + cfg = make_cfg({"distributed": {"enabled": True, "launcher": "torchrun", "nnodes": -1}}) + errors = ConfigValidator.validate(cfg) + assert any("positive integer" in e for e in errors) + + def test_valid_nnodes(self): + cfg = make_cfg({"distributed": {"enabled": True, "launcher": "torchrun", "nnodes": 4}}) + errors = ConfigValidator.validate(cfg) + assert not any("positive integer" in e for e in errors) + + +class TestUnknownKeys: + def test_unknown_top_level_key(self): + cfg = make_cfg({"gpu_vendor": "AMD", "typo_key": "oops"}) + errors = ConfigValidator.validate(cfg) + assert any("Unknown config key: 'typo_key'" in e for e in errors) + + def test_known_keys_accepted(self): + cfg = make_cfg({"gpu_vendor": "AMD", "debug": True, "env_vars": {}}) + errors = ConfigValidator.validate(cfg) + assert not any("Unknown config key" in e for e in errors) + + +class TestPlatformValidation: + def test_unsupported_platform(self): + cfg = make_cfg({"platform": {"type": "bare_metal"}}) + errors = ConfigValidator.validate(cfg) + assert any("not yet supported" in e for e in errors) + + def test_docker_platform_ok(self): + cfg = make_cfg({"platform": 
{"type": "docker"}}) + errors = ConfigValidator.validate(cfg) + assert not any("not yet supported" in e for e in errors) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/unit/test_config_schema.py -v` +Expected: FAIL — validator returns empty list. + +- [ ] **Step 3: Implement ConfigValidator** + +Replace the content of `src/madengine/config/schema.py` with: + +```python +"""Config validation for composed Hydra configs.""" + +from omegaconf import DictConfig, OmegaConf + + +KNOWN_TOP_LEVEL_KEYS = { + "defaults", "platform", "scheduler", "hardware", "launcher", + "model", "docker", "build", "env_vars", "debug", "live_output", + "log_error", "tools", "pre_scripts", "post_scripts", + "encapsulate_script", "data_config", "output", "summary_output", + "gpu_vendor", "guest_os", "runtime", "slurm", "k8s", + "kubernetes", "distributed", "vllm", "sglang_disagg", + "shared_data", "timeout", "gpu_type", "gpu_memory_gb", + "gpus_per_node", "data", +} + +SUPPORTED_PLATFORMS = {"docker"} + + +class ConfigValidator: + """Validates composed config for consistency.""" + + @staticmethod + def validate(cfg: DictConfig) -> list: + """Return list of validation errors (empty = valid).""" + errors = [] + + # Convert to plain containers: nested DictConfig nodes are not + # ``dict`` instances, so the isinstance checks below would be skipped. + raw = ( + OmegaConf.to_container(cfg, resolve=True) + if isinstance(cfg, DictConfig) + else {} + ) + + if raw.get("slurm") and raw.get("k8s"): + errors.append( + "Cannot specify both 'slurm' and 'k8s' sections" + ) + + dist = raw.get("distributed") + if isinstance(dist, dict): + if dist.get("enabled") and not dist.get("launcher"): + errors.append( + "distributed.enabled=true requires distributed.launcher" + ) + nnodes = dist.get("nnodes") + if nnodes is not None: + if not isinstance(nnodes, int) or nnodes < 1: + errors.append( + "distributed.nnodes must be a positive integer" + ) + + platform = raw.get("platform") + if isinstance(platform, dict): + ptype = platform.get("type") + if ptype and ptype not in SUPPORTED_PLATFORMS: + errors.append( + f"Platform '{ptype}' is not yet supported. 
" + f"Supported: {', '.join(sorted(SUPPORTED_PLATFORMS))}" + ) + + for key in raw: + if key not in KNOWN_TOP_LEVEL_KEYS: + errors.append(f"Unknown config key: '{key}'") + + return errors +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pytest tests/unit/test_config_schema.py -v` +Expected: All tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/madengine/config/schema.py tests/unit/test_config_schema.py +git commit -m "feat(config): implement ConfigValidator with cross-field checks" +``` + +--- + +### Task 7: Integration Test — load_config End-to-End + +**Files:** +- Create: `tests/unit/test_config_integration.py` +- Create: `tests/fixtures/configs/test_slurm_job.yaml` + +- [ ] **Step 1: Create test fixture YAML** + +Write to `tests/fixtures/configs/test_slurm_job.yaml`: + +```yaml +model: + tags: [dummy] + +slurm: + partition: test-partition + nodes: 2 + +distributed: + enabled: true + launcher: torchrun + nnodes: 2 + nproc_per_node: 4 + +env_vars: + MY_VAR: test_value + +debug: true +``` + +- [ ] **Step 2: Write integration tests** + +Write to `tests/unit/test_config_integration.py`: + +```python +#!/usr/bin/env python3 +"""Integration tests for load_config end-to-end pipeline.""" + +import os +import pytest +from pathlib import Path + +from madengine.config import load_config +from madengine.core.errors import ConfigurationError + + +FIXTURES_DIR = Path(__file__).parent.parent / "fixtures" / "configs" + + +class TestLoadConfigEndToEnd: + def test_defaults_produce_valid_context(self): + ctx, meta = load_config([]) + assert ctx["gpu_vendor"] == "AMD" + assert ctx["guest_os"] == "UBUNTU" + assert meta["model"]["tags"] == [] + + def test_scheduler_slurm(self): + ctx, meta = load_config(["scheduler=slurm"]) + assert "slurm" in ctx + assert ctx["slurm"]["partition"] == "amd-rccl" + + def test_launcher_torchrun(self): + ctx, meta = load_config(["launcher=torchrun"]) + assert ctx["distributed"]["enabled"] is True + assert 
ctx["distributed"]["launcher"] == "torchrun" + + def test_combined_overrides(self): + ctx, meta = load_config([ + "scheduler=slurm", + "launcher=torchrun", + "distributed.nnodes=4", + "+env=nccl_debug", + ]) + assert ctx["distributed"]["nnodes"] == 4 + assert ctx["env_vars"]["NCCL_DEBUG"] == "INFO" + assert "slurm" in ctx + + def test_user_yaml_file(self): + yaml_path = str(FIXTURES_DIR / "test_slurm_job.yaml") + ctx, meta = load_config([yaml_path]) + assert meta["model"]["tags"] == ["dummy"] + assert ctx["slurm"]["partition"] == "test-partition" + assert ctx["distributed"]["nnodes"] == 2 + assert ctx["env_vars"]["MY_VAR"] == "test_value" + assert ctx["debug"] is True + + def test_user_yaml_with_override(self): + yaml_path = str(FIXTURES_DIR / "test_slurm_job.yaml") + ctx, meta = load_config([yaml_path, "distributed.nnodes=8"]) + assert ctx["distributed"]["nnodes"] == 8 + + def test_docker_keys_translated(self): + ctx, meta = load_config(["docker.build_args.KEY=val"]) + assert ctx["docker_build_arg"]["KEY"] == "val" + + def test_slurm_and_k8s_conflict_raises(self): + with pytest.raises(ConfigurationError, match="Cannot specify both"): + load_config(["scheduler=slurm", "k8s.namespace=test"]) + + def test_unsupported_platform_raises(self): + with pytest.raises(ConfigurationError, match="not yet supported"): + load_config(["platform=bare_metal"]) + + def test_container_image_promoted(self): + ctx, meta = load_config( + ["model.container_image=myimage:latest"] + ) + assert ctx["MAD_CONTAINER_IMAGE"] == "myimage:latest" + + def test_model_tags_in_metadata(self): + ctx, meta = load_config(["model.tags=[dummy,bert]"]) + assert meta["model"]["tags"] == ["dummy", "bert"] + assert "model" not in ctx + + def test_profile_append(self): + ctx, meta = load_config(["+profile=mi300x_8gpu"]) + assert ctx["gpu_type"] == "mi300x" + assert ctx["env_vars"]["HSA_ENABLE_SDMA"] == "0" + + def test_tools_append(self): + ctx, meta = load_config(["+tools=rocprofv3_lightweight"]) + assert 
len(ctx["tools"]) == 1 + assert ctx["tools"][0]["name"] == "rocprofv3_lightweight" +``` + +- [ ] **Step 3: Run integration tests** + +Run: `pytest tests/unit/test_config_integration.py -v` +Expected: All tests PASS. + +- [ ] **Step 4: Commit** + +```bash +git add tests/unit/test_config_integration.py tests/fixtures/configs/ +git commit -m "test(config): add integration tests for load_config pipeline" +``` + +--- + +### Task 8: Integrate --config into CLI run Command + +**Files:** +- Modify: `src/madengine/cli/commands/run.py` + +- [ ] **Step 1: Add --config parameter and merge logic to run command** + +In `src/madengine/cli/commands/run.py`, add the import at the top (after the existing imports, around line 9): + +```python +import ast +``` + +Add the `--config` parameter to the `run` function signature, after the `additional_context_file` parameter (after line 83): + +```python + config: Annotated[ + Optional[List[str]], + typer.Option( + "--config", + help="YAML config file and/or Hydra overrides; repeat the flag for each value (e.g., --config my_job.yaml, or --config scheduler=slurm --config launcher=torchrun)", + ), + ] = None, +``` + +After line 165 (`processed_tags = split_comma_separated_tags(tags)`), insert the config loading block: + +```python + # Load --config YAML if provided + if config: + from madengine.config import load_config + + config_ctx, config_meta = load_config(config) + + # Config values provide defaults; explicit CLI args override + if not processed_tags and config_meta.get("model", {}).get("tags"): + processed_tags = config_meta["model"]["tags"] + if timeout == DEFAULT_TIMEOUT and config_meta.get("model", {}).get("timeout"): + timeout = config_meta["model"]["timeout"] + if not manifest_file and config_meta.get("model", {}).get("manifest_file"): + manifest_file = config_meta["model"]["manifest_file"] + if not registry and config_meta.get("build", {}).get("registry"): + registry = config_meta["build"]["registry"] + + # Merge: config is base, --additional-context overrides + parsed_ac 
= {} + if additional_context and additional_context.strip() != "{}": + try: + parsed_ac = json.loads(additional_context) + except json.JSONDecodeError: + parsed_ac = ast.literal_eval(additional_context) + + def _deep_merge(base: dict, override: dict) -> dict: + result = base.copy() + for k, v in override.items(): + if k in result and isinstance(result[k], dict) and isinstance(v, dict): + result[k] = _deep_merge(result[k], v) + else: + result[k] = v + return result + + merged = _deep_merge(config_ctx, parsed_ac) + additional_context = repr(merged) + additional_context_file = None +``` + +- [ ] **Step 2: Verify the existing test suite still passes** + +Run: `pytest tests/unit/test_cli.py -v` +Expected: All existing tests PASS (backward compatibility preserved). + +- [ ] **Step 3: Commit** + +```bash +git add src/madengine/cli/commands/run.py +git commit -m "feat(config): integrate --config into run command" +``` + +--- + +### Task 9: Integrate --config into CLI build Command + +**Files:** +- Modify: `src/madengine/cli/commands/build.py` + +- [ ] **Step 1: Add --config parameter and merge logic to build command** + +In `src/madengine/cli/commands/build.py`, add the import at the top (after existing imports, around line 9): + +```python +import ast +``` + +Add the `--config` parameter to the `build` function signature, after the `additional_context_file` parameter (after line 71): + +```python + config: Annotated[ + Optional[List[str]], + typer.Option( + "--config", + help="YAML config file and/or Hydra overrides (e.g., --config my_job.yaml, --config scheduler=slurm)", + ), + ] = None, +``` + +After line 104 (`processed_tags = split_comma_separated_tags(tags)`), insert the config loading block: + +```python + # Load --config YAML if provided + if config: + from madengine.config import load_config + + config_ctx, config_meta = load_config(config) + + # Config values provide defaults; explicit CLI args override + if not processed_tags and config_meta.get("model", 
{}).get("tags"): + processed_tags = config_meta["model"]["tags"] + if not registry and config_meta.get("build", {}).get("registry"): + registry = config_meta["build"]["registry"] + build_meta = config_meta.get("build", {}) + if not target_archs and build_meta.get("target_archs"): + target_archs = build_meta["target_archs"] + + # Merge: config is base, --additional-context overrides + parsed_ac = {} + if additional_context and additional_context.strip() != "{}": + try: + parsed_ac = json.loads(additional_context) + except json.JSONDecodeError: + parsed_ac = ast.literal_eval(additional_context) + + def _deep_merge(base: dict, override: dict) -> dict: + result = base.copy() + for k, v in override.items(): + if k in result and isinstance(result[k], dict) and isinstance(v, dict): + result[k] = _deep_merge(result[k], v) + else: + result[k] = v + return result + + merged = _deep_merge(config_ctx, parsed_ac) + additional_context = repr(merged) + additional_context_file = None +``` + +- [ ] **Step 2: Verify the existing test suite still passes** + +Run: `pytest tests/unit/test_cli.py -v` +Expected: All existing tests PASS. + +- [ ] **Step 3: Commit** + +```bash +git add src/madengine/cli/commands/build.py +git commit -m "feat(config): integrate --config into build command" +``` + +--- + +### Task 10: Extract _deep_merge to Shared Utility + +The `_deep_merge` function is duplicated in both `run.py` and `build.py`. Extract it. + +**Files:** +- Modify: `src/madengine/cli/utils.py` +- Modify: `src/madengine/cli/commands/run.py` +- Modify: `src/madengine/cli/commands/build.py` + +- [ ] **Step 1: Add deep_merge to cli/utils.py** + +At the bottom of `src/madengine/cli/utils.py`, add: + +```python +def deep_merge(base: dict, override: dict) -> dict: + """Recursively merge override into base. 
Override wins on conflicts.""" + result = base.copy() + for k, v in override.items(): + if k in result and isinstance(result[k], dict) and isinstance(v, dict): + result[k] = deep_merge(result[k], v) + else: + result[k] = v + return result +``` + +- [ ] **Step 2: Update run.py to use shared deep_merge** + +In `src/madengine/cli/commands/run.py`, add `deep_merge` to the import from `..utils`: + +```python +from ..utils import ( + console, + setup_logging, + split_comma_separated_tags, + create_args_namespace, + save_summary_with_feedback, + display_results_table, + display_performance_table, + deep_merge, +) +``` + +Remove the inline `_deep_merge` function definition and replace `_deep_merge(` with `deep_merge(` in the config loading block. + +- [ ] **Step 3: Update build.py to use shared deep_merge** + +In `src/madengine/cli/commands/build.py`, add `deep_merge` to the import from `..utils`: + +```python +from ..utils import ( + console, + setup_logging, + split_comma_separated_tags, + create_args_namespace, + save_summary_with_feedback, + display_results_table, + deep_merge, +) +``` + +Remove the inline `_deep_merge` function definition and replace `_deep_merge(` with `deep_merge(` in the config loading block. + +- [ ] **Step 4: Run all tests** + +Run: `pytest tests/unit/ -v --timeout=60` +Expected: All tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/madengine/cli/utils.py src/madengine/cli/commands/run.py src/madengine/cli/commands/build.py +git commit -m "refactor(config): extract deep_merge to shared utility" +``` + +--- + +### Task 11: Final Verification — Full Test Suite + +**Files:** +- No new files — verification only. + +- [ ] **Step 1: Run the complete unit test suite** + +Run: `pytest tests/unit/ -v --timeout=60` +Expected: All tests PASS including the new config tests. + +- [ ] **Step 2: Run the pre-commit hooks** + +Run: `pre-commit run --all-files` +Expected: All hooks pass (black, isort, flake8). 
+ +- [ ] **Step 3: Verify Hydra config composition end-to-end** + +Run: `python -c " +from madengine.config import load_config +ctx, meta = load_config(['scheduler=slurm', 'launcher=torchrun', '+profile=mi300x_8gpu', '+env=nccl_debug', 'model.tags=[dummy]']) +print('Tags:', meta['model']['tags']) +print('Launcher:', ctx['distributed']['launcher']) +print('Partition:', ctx['slurm']['partition']) +print('NCCL_DEBUG:', ctx['env_vars'].get('NCCL_DEBUG')) +print('GPU type:', ctx.get('gpu_type')) +"` + +Expected output: +``` +Tags: ['dummy'] +Launcher: torchrun +Partition: amd-rccl +NCCL_DEBUG: INFO +GPU type: mi300x +``` + +- [ ] **Step 4: Verify CLI help text includes --config** + +Run: `madengine run --help | grep -A2 "config"` +Expected: Shows `--config` option with help text about YAML config files and Hydra overrides. + +- [ ] **Step 5: Final commit if any formatting fixes were needed** + +```bash +git add -u +git commit -m "style: apply formatting fixes from pre-commit hooks" +``` diff --git a/docs/superpowers/specs/2026-05-02-config-driven-yaml-design.md b/docs/superpowers/specs/2026-05-02-config-driven-yaml-design.md new file mode 100644 index 00000000..8c57d67e --- /dev/null +++ b/docs/superpowers/specs/2026-05-02-config-driven-yaml-design.md @@ -0,0 +1,973 @@ +# Config-Driven YAML System for madengine + +**Date:** 2026-05-02 +**Status:** Draft +**Author:** Stephen Shao + Claude + +## Overview + +Add a `--config` CLI argument to madengine that accepts Hydra-based YAML configuration files with full CLI override support. This replaces the error-prone `--additional-context` JSON string approach with structured, composable YAML configs that can drive the entire workflow from a single file — model selection, deployment target, distributed training, profiling tools, and environment tuning. + +## Goals + +1. Single `--config` argument drives the full madengine workflow (build + run) +2. Hydra config groups for composable deployment configurations +3. 
CLI override support via dot-path syntax (`distributed.nnodes=4`) +4. Clean, readable YAML keys with a translator to internal format +5. Backward compatible: `--additional-context` still works and overrides `--config` +6. Extensible for future platforms (bare metal, Singularity, Podman) + +## Non-Goals + +- Replacing `models.json` or `data.json` with YAML (they remain as-is) +- Adding Hydra's `@hydra.main` decorator (Typer remains the CLI framework) +- Recipe configs (can be added later as a config group) + +--- + +## Config Directory Structure + +``` +src/madengine/configs/ +├── config.yaml # Root defaults + top-level settings +│ +├── platform/ # WHERE: execution platform +│ ├── docker.yaml # Docker container (default) +│ ├── bare_metal.yaml # Direct execution, no container (future) +│ ├── singularity.yaml # Singularity/Apptainer (future) +│ └── podman.yaml # Podman container (future) +│ +├── scheduler/ # HOW: job scheduling +│ ├── local.yaml # Direct execution on current host (default) +│ ├── slurm.yaml # SLURM HPC cluster +│ └── k8s.yaml # Kubernetes cluster +│ +├── hardware/ # WHAT GPU: vendor + runtime settings +│ ├── amd.yaml # AMD ROCm (default) — vendor, guest_os, +│ │ # device mounts, security opts, renderD +│ ├── nvidia.yaml # NVIDIA CUDA — vendor, --gpus flag +│ └── cpu.yaml # CPU-only — no GPU devices +│ +├── launcher/ # WHAT FRAMEWORK: distributed launcher +│ ├── none.yaml # No distributed launcher (default) +│ ├── torchrun.yaml # PyTorch torchrun +│ ├── deepspeed.yaml # DeepSpeed +│ ├── megatron.yaml # Megatron-LM +│ ├── vllm.yaml # vLLM inference serving +│ ├── sglang.yaml # SGLang inference serving +│ ├── sglang_disagg.yaml # SGLang disaggregated prefill/decode +│ ├── torchtitan.yaml # TorchTitan +│ ├── primus.yaml # Primus launcher +│ └── native.yaml # Native distributed (manual setup) +│ +├── profile/ # OPTIONAL: hardware profiles (+profile=) +│ ├── mi300x_8gpu.yaml +│ ├── mi300x_single.yaml +│ ├── mi250x_4gpu.yaml +│ ├── h100_8gpu.yaml +│ └── 
a100_8gpu.yaml +│ +├── env/ # OPTIONAL: env var bundles (+env=) +│ ├── nccl_debug.yaml +│ ├── nccl_tuned.yaml +│ ├── infiniband.yaml +│ └── miopen_defaults.yaml +│ +├── tools/ # OPTIONAL: profiling tools (+tools=) +│ ├── rocprofv3_lightweight.yaml +│ ├── rocprofv3_comprehensive.yaml +│ ├── power_profiler.yaml +│ ├── vram_profiler.yaml +│ └── rocm_trace_lite.yaml +│ +├── data/ # OPTIONAL: data provider (+data=) +│ ├── local.yaml # Local filesystem data +│ ├── s3.yaml # AWS S3 data source +│ ├── minio.yaml # MinIO object storage +│ └── nas.yaml # NAS/NFS shared storage +│ +└── build/ # OPTIONAL: build settings (+build=) + ├── default.yaml # Default build settings + ├── ci.yaml # CI pipeline (no cache, strict) + └── multi_arch.yaml # Multi-architecture builds +``` + +**Note:** `platform/` config group stubs (bare_metal, singularity, podman) are created with placeholder content for future extensibility. In Phase 1, only `docker` is functional — the others raise a `ConfigurationError("platform '{name}' is not yet supported")` if selected. + +### Config Group Types + +| Group | Type | Hydra Syntax | Purpose | +|-------|------|-------------|---------| +| `platform` | Default | `platform=docker` | Execution platform | +| `scheduler` | Default | `scheduler=slurm` | Job scheduler | +| `hardware` | Default | `hardware=amd` | GPU vendor + runtime | +| `launcher` | Default | `launcher=torchrun` | Distributed launcher | +| `profile` | Append-only | `+profile=mi300x_8gpu` | Hardware presets | +| `env` | Append-only | `+env=nccl_tuned` | Env var bundles | +| `tools` | Append-only | `+tools=rocprofv3_lightweight` | Profiling tools | +| `data` | Append-only | `+data=local` | Data provider | +| `build` | Append-only | `+build=ci` | Build settings | + +Default groups: exactly one option is selected; changing it replaces the previous selection. +Append-only groups: added on top of existing config via `+` prefix; composable. 
+ +--- + +## YAML Schema + +### Root Config (`config.yaml`) + +```yaml +defaults: + - platform: docker + - scheduler: local + - hardware: amd + - launcher: none + - _self_ + +# Model selection +model: + tags: [] # Model tags to build+run (equivalent to --tags) + manifest_file: null # Use existing manifest (equivalent to --manifest-file) + container_image: null # Skip build, use image (equivalent to MAD_CONTAINER_IMAGE) + skip_run: false # Build only (equivalent to --skip-model-run) + timeout: null # Run timeout in seconds + +# Docker / container settings +docker: + build_args: {} # --build-arg flags + env_vars: {} # --env flags for docker run + mounts: {} # -v host:container volume mounts + gpus: null # GPU device range (auto-detected if null) + cpus: null # CPU affinity (--cpuset-cpus) + additional_run_options: null # Extra docker run flags + keep_alive: false # Keep containers after run + clean_cache: false # Rebuild without cache + +# Build settings +build: + registry: null # Docker registry URL + target_archs: [] # Target GPU architectures for multi-arch + manifest_output: build_manifest.json + +# Environment variables (passed to container/job — separate from docker.env_vars) +env_vars: {} + +# Runtime behavior +debug: false +live_output: false + +# Error scanning +log_error: + pattern_scan: true + benign_patterns: [] + patterns: [] + +# Scripts +tools: [] +pre_scripts: [] +post_scripts: [] +encapsulate_script: null + +# Data +data_config: data.json + +# Output +output: perf.csv +summary_output: null +``` + +### Scheduler Configs + +**`scheduler/local.yaml`:** +```yaml +# @package _global_ +# Local execution — no scheduler-specific config needed +``` + +**`scheduler/slurm.yaml`:** +```yaml +# @package _global_ +slurm: + partition: amd-rccl + nodes: 1 + gpus_per_node: 8 + time: "24:00:00" + output_dir: ./slurm_results + exclusive: true + modules: [] + account: null + qos: null + constraint: null + nodelist: null + exclude: null + results_dir: null + 
shared_workspace: null + network_interface: null + +env_vars: + OMP_NUM_THREADS: "8" + MIOPEN_FIND_MODE: "1" +``` + +**`scheduler/k8s.yaml`:** +```yaml +# @package _global_ +k8s: + kubeconfig: ~/.kube/config + namespace: default + image_pull_policy: Always + backoff_limit: 3 + ttl_seconds_after_finished: null + allow_privileged_profiling: null + gpu_count: null + gpu_resource_name: amd.com/gpu + memory: null + memory_limit: null + cpu: null + cpu_limit: null + host_ipc: true + node_selector: {} + tolerations: [] + nfs_storage_class: nfs-banff + local_path_storage_class: local-path + data_storage_class: nfs-banff + recreate_shared_data_pvc: false + results_pvc: null + data_pvc: null + output_dir: null + secrets: + strategy: from_local_credentials + image_pull_secret_names: [] + runtime_secret_name: null + +env_vars: + OMP_NUM_THREADS: "8" +``` + +### Hardware Configs + +**`hardware/amd.yaml`:** +```yaml +# @package _global_ +gpu_vendor: AMD +guest_os: UBUNTU + +runtime: + devices: + - /dev/kfd + - /dev/dri + - /dev/infiniband + capabilities: + - SYS_PTRACE + security_opts: + - seccomp=unconfined + network_mode: host + ipc: host + groups: + - video + use_gpu_flag: false +``` + +**`hardware/nvidia.yaml`:** +```yaml +# @package _global_ +gpu_vendor: NVIDIA +guest_os: UBUNTU + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: host + ipc: host + groups: [] + use_gpu_flag: true +``` + +**`hardware/cpu.yaml`:** +```yaml +# @package _global_ +gpu_vendor: null +guest_os: UBUNTU + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: null + ipc: null + groups: [] + use_gpu_flag: false +``` + +### Launcher Configs + +**`launcher/none.yaml`:** +```yaml +# @package _global_ +distributed: + enabled: false +``` + +**`launcher/torchrun.yaml`:** +```yaml +# @package _global_ +distributed: + enabled: true + launcher: torchrun + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 + port: 29500 +``` + 
+**`launcher/vllm.yaml`:** +```yaml +# @package _global_ +distributed: + enabled: true + launcher: vllm + nnodes: 1 + nproc_per_node: 4 + +vllm: + kv_cache_size: 0.7 + max_model_len: null + tensor_parallel_size: null +``` + +**`launcher/sglang_disagg.yaml`:** +```yaml +# @package _global_ +distributed: + enabled: true + launcher: sglang-disagg + backend: nccl + nnodes: 3 + nproc_per_node: 8 + port: 29500 + +sglang_disagg: + prefill_nodes: null + decode_nodes: null + transfer_backend: mooncake +``` + +**`launcher/deepspeed.yaml`:** +```yaml +# @package _global_ +distributed: + enabled: true + launcher: deepspeed + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 +``` + +**`launcher/megatron.yaml`:** +```yaml +# @package _global_ +distributed: + enabled: true + launcher: torchrun + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 +``` + +### Profile Configs (append-only) + +**`profile/mi300x_8gpu.yaml`:** +```yaml +# @package _global_ +# Use: +profile=mi300x_8gpu +# Note: profile keys use gpu_* prefix to avoid collision with hardware/ config group +gpu_type: mi300x +gpu_memory_gb: 192 +gpus_per_node: 8 + +distributed: + nproc_per_node: 8 + +env_vars: + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" + HSA_FORCE_FINE_GRAIN_PCIE: "1" +``` + +### Env Configs (append-only) + +**`env/infiniband.yaml`:** +```yaml +# @package _global_ +# Use: +env=infiniband +env_vars: + NCCL_IB_DISABLE: "0" + NCCL_IB_HCA: "mlx5_0:1,mlx5_1:1" + NCCL_SOCKET_IFNAME: ib0 + NCCL_NET_GDR_LEVEL: 3 +``` + +**`env/nccl_debug.yaml`:** +```yaml +# @package _global_ +# Use: +env=nccl_debug +env_vars: + NCCL_DEBUG: INFO + NCCL_DEBUG_SUBSYS: "INIT,NET,GRAPH" + TORCH_DISTRIBUTED_DEBUG: DETAIL +``` + +### Tools Configs (append-only) + +**`tools/rocprofv3_comprehensive.yaml`:** +```yaml +# @package _global_ +# Use: +tools=rocprofv3_comprehensive +tools: + - name: rocprofv3_full + env_vars: + RCCL_DEBUG: INFO + HSA_ENABLE_SDMA: "0" + - name: gpu_info_power_profiler + 
env_vars: + POWER_DEVICE: all + POWER_SAMPLING_RATE: "0.1" + POWER_DUAL_GCD: "false" + - name: gpu_info_vram_profiler + env_vars: + VRAM_DEVICE: all + VRAM_SAMPLING_RATE: "0.1" + - name: miopen_trace + - name: rocblas_trace +``` + +--- + +## Internal Architecture + +### New Module: `src/madengine/config/` + +``` +src/madengine/config/ +├── __init__.py # Public API: load_config() +├── loader.py # HydraConfigLoader: Compose API integration +├── translator.py # Maps clean YAML keys → internal additional_context dict +└── schema.py # Config validation +``` + +### Data Flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ CLI Layer │ +│ │ +│ --config file.yaml key=val → config_args: List[str] │ +│ --tags llama3 → tags: List[str] │ +│ --timeout 3600 → timeout: int │ +│ --additional-context '{...}' → additional_context: str │ +│ │ +└──────────────────────┬──────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ HydraConfigLoader.load(config_args) │ +│ │ +│ 1. Separate file path from Hydra overrides │ +│ 2. initialize_config_dir("pkg://madengine.configs") │ +│ 3. compose(config_name="config", overrides=[...]) │ +│ 4. If user YAML file: OmegaConf.merge(cfg, user_cfg) │ +│ 5. 
Return DictConfig │ +│ │ +└──────────────────────┬──────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ ConfigTranslator.to_additional_context(cfg) │ +│ │ +│ Maps clean YAML keys to internal additional_context format: │ +│ │ +│ YAML Key → Internal Key │ +│ ───────────────────────── ────────────────────── │ +│ docker.build_args → docker_build_arg │ +│ docker.env_vars → docker_env_vars │ +│ docker.mounts → docker_mounts │ +│ docker.gpus → docker_gpus │ +│ docker.cpus → docker_cpus │ +│ docker.additional_run_options → additional_docker_run_options │ +│ model.container_image → MAD_CONTAINER_IMAGE │ +│ log_error.pattern_scan → log_error_pattern_scan │ +│ log_error.benign_patterns → log_error_benign_patterns │ +│ log_error.patterns → log_error_patterns │ +│ runtime.* → (Context runtime settings) │ +│ │ +│ Passthrough keys (no translation): │ +│ gpu_vendor, guest_os, env_vars, tools, pre_scripts, │ +│ post_scripts, encapsulate_script, debug, slurm, k8s, │ +│ distributed, vllm, sglang_disagg, shared_data │ +│ │ +│ Extracted (not in additional_context): │ +│ model.tags → returned separately for orchestrator │ +│ model.manifest_file → returned separately │ +│ model.timeout → returned separately │ +│ build.registry → returned separately │ +│ build.target_archs → returned separately │ +│ │ +│ Returns: (additional_context: dict, metadata: dict) │ +│ │ +└──────────────────────┬──────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Merge Layer │ +│ │ +│ 1. Start with translated config dict │ +│ 2. CLI args override equivalent config keys: │ +│ --tags provided? → overrides model.tags │ +│ --timeout provided? → overrides model.timeout │ +│ 3. --additional-context merged on top (highest priority) │ +│ 4. 
Result = final additional_context dict │ +│ │ +└──────────────────────┬──────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Existing Pipeline (unchanged) │ +│ │ +│ BuildOrchestrator(args) → Context(repr(merged_dict)) │ +│ RunOrchestrator(args) → ContainerRunner / DeploymentFactory │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Merge Precedence (lowest → highest) + +1. **Config group defaults** — `config.yaml` defaults list +2. **Selected config groups** — `scheduler=slurm`, `launcher=torchrun` +3. **Appended config groups** — `+profile=mi300x_8gpu`, `+env=nccl_tuned` +4. **User YAML file** — if `--config /path/to/file.yaml` +5. **Inline Hydra overrides** — `distributed.nnodes=4` +6. **CLI args** — `--tags`, `--timeout` (override equivalent config keys) +7. **`--additional-context`** — highest priority (backward compatibility) + +### HydraConfigLoader + +```python +from hydra import compose, initialize_config_dir +from hydra.core.global_hydra import GlobalHydra +from omegaconf import OmegaConf, DictConfig +from pathlib import Path +import importlib.resources + +class HydraConfigLoader: + """Loads madengine config using Hydra's Compose API.""" + + @staticmethod + def load(config_args: list[str]) -> DictConfig: + """Load and compose config from Hydra overrides and/or user YAML. + + Args: + config_args: Mix of Hydra overrides and optional user YAML path. + Examples: + ["scheduler=slurm", "launcher=torchrun", "distributed.nnodes=4"] + ["/path/to/my_job.yaml"] + ["/path/to/my_job.yaml", "distributed.nnodes=8"] + + Returns: + Composed DictConfig with all merges applied. 
+ """ + user_file, overrides = HydraConfigLoader._parse_args(config_args) + + # Resolve package config directory + config_dir = str( + importlib.resources.files("madengine") / "configs" + ) + + # Clear any previous Hydra state + GlobalHydra.instance().clear() + + with initialize_config_dir( + config_dir=config_dir, version_base=None + ): + cfg = compose(config_name="config", overrides=overrides) + + # Merge user file on top if provided + if user_file: + user_cfg = OmegaConf.load(user_file) + OmegaConf.set_struct(cfg, False) + cfg = OmegaConf.merge(cfg, user_cfg) + + return cfg + + @staticmethod + def _parse_args(config_args: list[str]) -> tuple[str | None, list[str]]: + """Separate user YAML file path from Hydra overrides.""" + user_file = None + overrides = [] + for arg in config_args: + if ( + arg.endswith(('.yaml', '.yml')) + and '=' not in arg + and not arg.startswith('+') + ): + if user_file: + raise ConfigurationError( + "Only one YAML config file allowed" + ) + user_file = arg + else: + overrides.append(arg) + return user_file, overrides +``` + +### ConfigTranslator + +```python +class ConfigTranslator: + """Translates clean YAML config to internal additional_context format.""" + + # YAML key → internal key mapping (only for keys that differ) + KEY_MAP = { + "docker.build_args": "docker_build_arg", + "docker.env_vars": "docker_env_vars", + "docker.mounts": "docker_mounts", + "docker.gpus": "docker_gpus", + "docker.cpus": "docker_cpus", + "docker.additional_run_options": "additional_docker_run_options", + "log_error.pattern_scan": "log_error_pattern_scan", + "log_error.benign_patterns": "log_error_benign_patterns", + "log_error.patterns": "log_error_patterns", + } + + # Keys extracted from config (not part of additional_context) + EXTRACTED_KEYS = { + "model", "build", "platform", "output", + "summary_output", "data_config", "live_output", + } + + @classmethod + def to_additional_context( + cls, cfg: DictConfig + ) -> tuple[dict, dict]: + """Convert 
DictConfig to (additional_context, metadata) tuple. + + additional_context: dict in the format expected by existing pipeline. + metadata: dict with model.tags, build.registry, etc. for the CLI layer. + """ + raw = OmegaConf.to_container(cfg, resolve=True) + + context = {} + metadata = {} + + for key, value in raw.items(): + if key in cls.EXTRACTED_KEYS: + metadata[key] = value + elif key == "docker": + # Flatten docker.* to docker_* keys + for subkey, subval in value.items(): + internal_key = cls.KEY_MAP.get( + f"docker.{subkey}", f"docker_{subkey}" + ) + if subval is not None: + context[internal_key] = subval + elif key == "log_error": + for subkey, subval in value.items(): + internal_key = cls.KEY_MAP.get( + f"log_error.{subkey}", f"log_error_{subkey}" + ) + context[internal_key] = subval + elif key == "runtime": + # Runtime settings stored separately, applied to Context + metadata["runtime"] = value + else: + # Passthrough: gpu_vendor, guest_os, env_vars, slurm, + # k8s, distributed, tools, pre_scripts, etc. 
+ if value is not None: + context[key] = value + + # Extract MAD_CONTAINER_IMAGE from model metadata + model = metadata.get("model", {}) + if model and model.get("container_image"): + context["MAD_CONTAINER_IMAGE"] = model["container_image"] + + return context, metadata +``` + +### Config Validation (`schema.py`) + +```python +class ConfigValidator: + """Validates composed config for consistency.""" + + @staticmethod + def validate(cfg: DictConfig) -> list[str]: + """Return list of validation errors (empty = valid).""" + errors = [] + + # Cross-field: scheduler=slurm must have slurm section + scheduler = cfg.get("scheduler", {}) + # (Hydra handles this via config group selection) + + # Conflict: can't have both slurm and k8s + if cfg.get("slurm") and cfg.get("k8s"): + errors.append( + "Cannot specify both 'slurm' and 'k8s' sections" + ) + + # Distributed: if enabled, must have launcher + dist = cfg.get("distributed", {}) + if dist.get("enabled") and not dist.get("launcher"): + errors.append( + "distributed.enabled=true requires distributed.launcher" + ) + + # Type checks + if dist.get("nnodes") is not None: + if not isinstance(dist["nnodes"], int) or dist["nnodes"] < 1: + errors.append("distributed.nnodes must be a positive integer") + + # Warn on unknown top-level keys + known_keys = { + "defaults", "platform", "scheduler", "hardware", "launcher", + "model", "docker", "build", "env_vars", "debug", "live_output", + "log_error", "tools", "pre_scripts", "post_scripts", + "encapsulate_script", "data_config", "output", "summary_output", + "gpu_vendor", "guest_os", "runtime", "slurm", "k8s", + "kubernetes", "distributed", "vllm", "sglang_disagg", + "shared_data", "timeout", + } + for key in cfg: + if key not in known_keys: + errors.append(f"Unknown config key: '{key}'") + + return errors +``` + +--- + +## CLI Integration + +### Changes to `commands/run.py` + +```python +def run( + tags: Annotated[...] = [], + # ... existing args ... 
+ config: Annotated[ + Optional[List[str]], + typer.Option( + "--config", + help=( + "YAML config file and/or Hydra overrides. " + "Examples: --config my_job.yaml, " + "--config scheduler=slurm launcher=torchrun, " + "--config my_job.yaml distributed.nnodes=4" + ), + ), + ] = None, + additional_context: Annotated[...] = "{}", + # ... rest of existing args ... +): + if config: + from madengine.config import load_config + config_ctx, config_meta = load_config(config) + + # Extract model selection from config (CLI args override) + if not tags and config_meta.get("model", {}).get("tags"): + tags = config_meta["model"]["tags"] + if timeout == DEFAULT_TIMEOUT and config_meta.get("model", {}).get("timeout"): + timeout = config_meta["model"]["timeout"] + if not manifest_file and config_meta.get("model", {}).get("manifest_file"): + manifest_file = config_meta["model"]["manifest_file"] + if not registry and config_meta.get("build", {}).get("registry"): + registry = config_meta["build"]["registry"] + + # Merge: config_ctx is base, additional_context overrides + parsed_ac = ast.literal_eval(additional_context) if additional_context != "{}" else {} + merged = deep_merge(config_ctx, parsed_ac) + additional_context = repr(merged) + + # ... rest of existing run logic (unchanged) ... +``` + +### Changes to `commands/build.py` + +Same pattern: add `--config` parameter, extract build-relevant metadata, merge with `additional_context`. 
+ +--- + +## Usage Examples + +### Single-file workflow (most common) + +```yaml +# my_slurm_training.yaml +defaults: + - /scheduler: slurm + - /launcher: torchrun + - /hardware: amd + - _self_ + +model: + tags: [megatron_llama3_70b] + +slurm: + partition: gpu-cluster + nodes: 4 + gpus_per_node: 8 + time: "48:00:00" + modules: [rocm/6.2.0] + +distributed: + nnodes: 4 + nproc_per_node: 8 + +env_vars: + NCCL_DEBUG: WARN + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" +``` + +```bash +madengine run --config my_slurm_training.yaml +``` + +### Config groups + inline overrides (no file) + +```bash +# SLURM multi-node with torchrun (repeat --config once per override) +madengine run --config scheduler=slurm --config launcher=torchrun \ + --config 'model.tags=[llama3]' --config distributed.nnodes=4 --config slurm.partition=gpu-high + +# K8s vLLM inference with profiling +madengine run --config scheduler=k8s --config launcher=vllm \ + --config +tools=rocprofv3_lightweight --config k8s.namespace=ml-inference \ + --config 'model.tags=[vllm_llama]' + +# Local single-GPU (all defaults, just select model) +madengine run --config 'model.tags=[dummy]' +``` + +### File + overrides + +```bash +# Base config from file, override node count +madengine run --config my_slurm_training.yaml --config distributed.nnodes=8 + +# Base config + add profiling tools +madengine run --config my_slurm_training.yaml --config +tools=power_profiler +``` + +### Backward compatible + +```bash +# --additional-context still works, overrides --config +madengine run --config my_slurm_training.yaml \ + --additional-context '{"slurm": {"partition": "override-partition"}}' + +# Pure --additional-context (no --config) still works exactly as before +madengine run --tags dummy -c '{"gpu_vendor": "AMD"}' +``` + +### Future: bare metal + +```bash +# No Docker — direct execution on host +madengine run --config platform=bare_metal --config scheduler=slurm \ + --config launcher=torchrun --config 'model.tags=[benchmark]' +``` + +--- + +## Migration Path + +### Phase 1: Add --config alongside --additional-context +- Both coexist; `--additional-context` has highest priority +- 
Existing JSON example configs can be converted to YAML (1:1 mapping via translator) +- No breaking changes + +### Phase 2: Convert existing JSON presets to YAML configs +- `deployment/presets/k8s/defaults.json` → `configs/scheduler/k8s.yaml` +- `deployment/presets/slurm/defaults.json` → `configs/scheduler/slurm.yaml` +- `deployment/presets/k8s/profiles/` → `configs/profile/` YAML files +- `examples/profiling-configs/*.json` → `configs/tools/` YAML files +- `examples/k8s-configs/*.json` → example YAML files in `examples/` + +### Phase 3: Deprecate --additional-context (future) +- Emit deprecation warning when `--additional-context` is used +- Eventually remove in a major version + +--- + +## Dependencies + +Add to `pyproject.toml`: + +```toml +dependencies = [ + # ... existing ... + "hydra-core>=1.3", + "omegaconf>=2.3", +] +``` + +Both are pure Python with minimal transitive dependencies. `omegaconf` is already a dependency of `hydra-core`. + +--- + +## Testing Strategy + +### Unit Tests + +- `test_config_loader.py`: HydraConfigLoader with various override combinations +- `test_config_translator.py`: ConfigTranslator key mapping, passthrough, extraction +- `test_config_schema.py`: Validation rules (conflicts, unknown keys, type checks) +- `test_merge.py`: Merge precedence (config < CLI < additional_context) + +### Integration Tests + +- End-to-end: `--config scheduler=slurm` produces correct `additional_context` +- File + overrides: `--config file.yaml --config key=value` merges correctly +- Backward compat: `--additional-context` without `--config` unchanged +- Both: `--config` + `--additional-context` merges with correct precedence + +### Fixture Configs + +- Add YAML equivalents of existing test fixture JSON files +- Test each config group individually and in combination + +--- + +## Files to Create + +| File | Purpose | +|------|---------| +| `src/madengine/config/__init__.py` | Public API | +| `src/madengine/config/loader.py` | HydraConfigLoader | +| `src/madengine/config/translator.py` | 
ConfigTranslator | +| `src/madengine/config/schema.py` | ConfigValidator | +| `src/madengine/configs/config.yaml` | Root config | +| `src/madengine/configs/platform/*.yaml` | Platform configs | +| `src/madengine/configs/scheduler/*.yaml` | Scheduler configs | +| `src/madengine/configs/hardware/*.yaml` | Hardware configs | +| `src/madengine/configs/launcher/*.yaml` | Launcher configs | +| `src/madengine/configs/profile/*.yaml` | Hardware profiles | +| `src/madengine/configs/env/*.yaml` | Env var presets | +| `src/madengine/configs/tools/*.yaml` | Profiling tool configs | +| `src/madengine/configs/data/*.yaml` | Data provider configs | +| `src/madengine/configs/build/*.yaml` | Build setting configs | +| `tests/unit/test_config_loader.py` | Loader tests | +| `tests/unit/test_config_translator.py` | Translator tests | +| `tests/unit/test_config_schema.py` | Validation tests | + +## Files to Modify + +| File | Change | +|------|--------| +| `pyproject.toml` | Add hydra-core, omegaconf dependencies | +| `src/madengine/cli/commands/run.py` | Add `--config` parameter, integration logic | +| `src/madengine/cli/commands/build.py` | Add `--config` parameter, integration logic | diff --git a/docs/usage.md b/docs/usage.md index 010a6b6c..b8570ac5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -27,12 +27,16 @@ madengine discover --tags dummy # Run locally (full workflow: discover/build/run as configured by the model) madengine run --tags dummy -# Or with explicit configuration +# Or with explicit JSON configuration madengine run --tags dummy \ --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Or with YAML config (composable, Hydra-based) +madengine run --tags dummy --config scheduler=slurm --config launcher=torchrun +madengine run --config my_job.yaml ``` -> **Note**: `gpu_vendor` defaults to `AMD` and `guest_os` defaults to `UBUNTU` for build operations. For production or non-AMD/Ubuntu environments, specify these values explicitly. 
+> **Note**: `--config` is mutually exclusive with `--additional-context` / `--additional-context-file`. `gpu_vendor` defaults to `AMD` and `guest_os` defaults to `UBUNTU` for build operations. Results are saved to `perf_entry.csv`. @@ -395,6 +399,8 @@ Deployment target is automatically detected from `slurm` key in configuration. T Use configuration files for complex settings: +**JSON format** (`--additional-context-file`): + **config.json:** ```json { @@ -412,6 +418,42 @@ Use configuration files for complex settings: madengine run --tags model --additional-context-file config.json ``` +**YAML format** (`--config`): + +**my_job.yaml:** +```yaml +model: + tags: [my_model] + timeout: 3600 + +debug: true + +env_vars: + PYTORCH_TUNABLEOP_ENABLED: "1" + HSA_ENABLE_SDMA: "0" + +distributed: + enabled: true + launcher: torchrun + nnodes: 2 + nproc_per_node: 4 +``` + +```bash +madengine run --config my_job.yaml + +# With additional overrides +madengine run --config my_job.yaml --config distributed.nnodes=4 + +# Or use config groups without a file +madengine run --tags model \ + --config scheduler=slurm \ + --config launcher=torchrun \ + --config +profile=mi300x_8gpu +``` + +> `--config` is mutually exclusive with `--additional-context` / `--additional-context-file`. See [Configuration Guide — YAML Configuration](configuration.md#yaml-configuration-config) for config groups and full details, and [`examples/configs/`](../examples/configs/) for annotated templates and ready-to-run demos. 
+ ### Custom Timeouts ```bash diff --git a/examples/build-manifest/batch.json b/examples/build-manifest/batch.json index 8996e43b..16f66d77 100644 --- a/examples/build-manifest/batch.json +++ b/examples/build-manifest/batch.json @@ -21,4 +21,3 @@ "build_new": false } ] - diff --git a/examples/build-manifest/ci_incremental.json b/examples/build-manifest/ci_incremental.json index af83ee86..715a6aed 100644 --- a/examples/build-manifest/ci_incremental.json +++ b/examples/build-manifest/ci_incremental.json @@ -20,4 +20,3 @@ "build_new": false } ] - diff --git a/examples/configs/README.md b/examples/configs/README.md new file mode 100644 index 00000000..8c4e1b80 --- /dev/null +++ b/examples/configs/README.md @@ -0,0 +1,95 @@ +# YAML Config Examples (`--config`) + +``` +configs/ +├── templates/ # Full reference — every field shown and annotated +│ ├── local.yaml +│ ├── slurm.yaml +│ └── k8s.yaml +└── demo/ # Minimal ready-to-run examples organised by target + ├── local/ + ├── slurm/ + └── k8s/ +``` + +## Workflow + +**Starting from scratch** — copy a template, fill in your model tag and cluster +settings, then delete the sections you don't need: + +```bash +cp examples/configs/templates/slurm.yaml my_job.yaml +# edit my_job.yaml … +madengine run --config my_job.yaml +``` + +**Starting from an example** — find the demo closest to your use case and +adapt it: + +```bash +cp examples/configs/demo/slurm/multi-node-torchrun.yaml my_job.yaml +# tweak partition, node count, tags … +madengine run --config my_job.yaml +``` + +**Inline overrides** — any field can be overridden without editing the file: + +```bash +madengine run --config my_job.yaml --config distributed.nnodes=4 +madengine run --config my_job.yaml --config +env=nccl_debug +madengine run --config my_job.yaml --config +tools=rocprofv3_lightweight +``` + +> `--config` is mutually exclusive with `--additional-context` / +> `--additional-context-file`. See `docs/configuration.md` for the full +> field reference. 
+ +--- + +## `templates/` + +| File | Target | Contents | +|------|--------|----------| +| `local.yaml` | Local Docker | All docker, model, tools, scripts, log-error, output fields | +| `slurm.yaml` | SLURM | All slurm, distributed, env_vars, tools, scripts fields | +| `k8s.yaml` | Kubernetes | All k8s, distributed, env_vars, tools, secrets, storage fields | + +## `demo/local/` + +| File | Model | Description | +|------|-------|-------------| +| `single-gpu.yaml` | `dummy` | Single GPU, no distribution | +| `multi-gpu-torchrun.yaml` | `dummy_torchrun` | Single node, 4 GPUs, torchrun | +| `deepspeed.yaml` | `dummy_deepspeed` | DeepSpeed ZeRO, single node | +| `vllm-inference.yaml` | `dummy_vllm` | vLLM tensor parallelism, 4 GPUs | +| `profiling.yaml` | `dummy` | ROCprofv3 + power + VRAM profiling | + +## `demo/slurm/` + +| File | Model | Description | +|------|-------|-------------| +| `single-node-single-gpu.yaml` | `dummy` | Single GPU job | +| `multi-node-torchrun.yaml` | `dummy_torchrun` | 2 nodes × 8 GPUs, Ethernet | +| `multi-node-torchrun-infiniband.yaml` | `dummy_torchrun` | 4 nodes × 8 GPUs, InfiniBand, account/QoS | +| `deepspeed.yaml` | `dummy_deepspeed` | DeepSpeed, single node | +| `megatron-lm.yaml` | `dummy_megatron_lm` | Megatron-LM, 4 nodes × 8 GPUs | +| `torchtitan.yaml` | `dummy_torchtitan` | TorchTitan TP+PP+FSDP2, 4 nodes × 8 GPUs | +| `vllm-inference.yaml` | `dummy_vllm` | vLLM data parallelism, 2 nodes × 4 GPUs | +| `sglang-inference.yaml` | `dummy_sglang` | SGLang, 2 nodes × 4 GPUs | +| `sglang-disagg.yaml` | `dummy_sglang_disagg` | SGLang disaggregated prefill/decode, 5 nodes | +| `profiling-multi-gpu.yaml` | `dummy_torchrun` | torchrun + RCCL + power + VRAM profiling | + +## `demo/k8s/` + +| File | Model | Description | +|------|-------|-------------| +| `single-gpu.yaml` | `dummy` | Single GPU pod | +| `multi-gpu-torchrun.yaml` | `dummy_torchrun` | 1 pod × 8 GPUs, torchrun | +| `multi-node-torchrun.yaml` | `dummy_torchrun` | 2 pods × 8 
GPUs, node selector | +| `nvidia-gpu.yaml` | `dummy_torchrun` | NVIDIA A100/H100, `nvidia.com/gpu` | +| `deepspeed.yaml` | `dummy_deepspeed` | DeepSpeed, single pod | +| `megatron-lm.yaml` | `dummy_megatron_lm` | Megatron-LM, 4 pods × 8 GPUs | +| `torchtitan.yaml` | `dummy_torchtitan` | TorchTitan TP+PP+FSDP2, 4 pods × 8 GPUs | +| `vllm-inference.yaml` | `dummy_vllm` | vLLM data parallelism, 2 pods × 4 GPUs | +| `sglang-inference.yaml` | `dummy_sglang` | SGLang, 2 pods × 4 GPUs | +| `sglang-disagg.yaml` | `dummy_sglang_disagg` | SGLang disaggregated, 5 pods | diff --git a/examples/configs/demo/k8s/deepspeed.yaml b/examples/configs/demo/k8s/deepspeed.yaml new file mode 100644 index 00000000..fce15b80 --- /dev/null +++ b/examples/configs/demo/k8s/deepspeed.yaml @@ -0,0 +1,23 @@ +# Kubernetes — DeepSpeed ZeRO distributed training +# madengine run --config examples/configs/demo/k8s/deepspeed.yaml + +model: + tags: [dummy_deepspeed] + +k8s: + namespace: default + gpu_count: 4 + memory: 128Gi + memory_limit: 256Gi + cpu: "32" + cpu_limit: "64" + host_ipc: true + +distributed: + enabled: true + launcher: deepspeed + nnodes: 1 + nproc_per_node: 4 + +env_vars: + OMP_NUM_THREADS: "8" diff --git a/examples/configs/demo/k8s/megatron-lm.yaml b/examples/configs/demo/k8s/megatron-lm.yaml new file mode 100644 index 00000000..c315963a --- /dev/null +++ b/examples/configs/demo/k8s/megatron-lm.yaml @@ -0,0 +1,28 @@ +# Kubernetes — Megatron-LM large-scale transformer training (4 pods × 8 GPUs) +# madengine run --config examples/configs/demo/k8s/megatron-lm.yaml + +model: + tags: [dummy_megatron_lm] + +k8s: + namespace: ml-training + gpu_count: 8 + memory: 256Gi + memory_limit: 512Gi + cpu: "64" + cpu_limit: "128" + host_ipc: true + image_pull_policy: IfNotPresent + node_selector: + feature.node.kubernetes.io/amd-gpu-mi300x: "true" + +distributed: + enabled: true + launcher: megatron + nnodes: 4 + nproc_per_node: 8 + master_port: 29500 + +env_vars: + OMP_NUM_THREADS: "16" + NCCL_DEBUG: 
INFO diff --git a/examples/configs/demo/k8s/multi-gpu-torchrun.yaml b/examples/configs/demo/k8s/multi-gpu-torchrun.yaml new file mode 100644 index 00000000..03e68264 --- /dev/null +++ b/examples/configs/demo/k8s/multi-gpu-torchrun.yaml @@ -0,0 +1,34 @@ +# Kubernetes — single-node multi-GPU with torchrun (1 pod × 8 GPUs) +# madengine run --config examples/configs/demo/k8s/multi-gpu-torchrun.yaml + +model: + tags: [dummy_torchrun] + +k8s: + namespace: default + gpu_count: 8 + memory: 256Gi + memory_limit: 384Gi + cpu: "64" + cpu_limit: "96" + host_ipc: true + +distributed: + enabled: true + launcher: torchrun + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 + +env_vars: + OMP_NUM_THREADS: "8" + NCCL_DEBUG: WARN + NCCL_IB_DISABLE: "1" + NCCL_SOCKET_IFNAME: eth0 + TORCH_NCCL_HIGH_PRIORITY: "1" + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" + HSA_FORCE_FINE_GRAIN_PCIE: "1" + MIOPEN_FIND_MODE: "1" + MIOPEN_USER_DB_PATH: /tmp/.miopen + RCCL_ENABLE_HIPGRAPH: "0" diff --git a/examples/configs/demo/k8s/multi-node-torchrun.yaml b/examples/configs/demo/k8s/multi-node-torchrun.yaml new file mode 100644 index 00000000..f1789f2a --- /dev/null +++ b/examples/configs/demo/k8s/multi-node-torchrun.yaml @@ -0,0 +1,39 @@ +# Kubernetes — multi-node torchrun (2 pods × 8 GPUs = 16 GPUs total) +# madengine run --config examples/configs/demo/k8s/multi-node-torchrun.yaml + +model: + tags: [dummy_torchrun] + +k8s: + namespace: default + gpu_count: 8 + memory: 256Gi + memory_limit: 384Gi + cpu: "64" + cpu_limit: "96" + host_ipc: true + node_selector: + feature.node.kubernetes.io/amd-gpu-mi300x: "true" + +distributed: + enabled: true + launcher: torchrun + nnodes: 2 + nproc_per_node: 8 + master_port: 29500 + +env_vars: + OMP_NUM_THREADS: "8" + NCCL_DEBUG: WARN + NCCL_DEBUG_SUBSYS: "INIT,NET" + NCCL_IB_DISABLE: "1" + NCCL_SOCKET_IFNAME: eth0 + TORCH_NCCL_HIGH_PRIORITY: "1" + GPU_MAX_HW_QUEUES: "2" + TORCH_NCCL_ASYNC_ERROR_HANDLING: "1" + NCCL_TIMEOUT: "600" + HSA_ENABLE_SDMA: "0" + 
HSA_FORCE_FINE_GRAIN_PCIE: "1" + MIOPEN_FIND_MODE: "1" + MIOPEN_USER_DB_PATH: /tmp/.miopen + RCCL_ENABLE_HIPGRAPH: "0" diff --git a/examples/configs/demo/k8s/nvidia-gpu.yaml b/examples/configs/demo/k8s/nvidia-gpu.yaml new file mode 100644 index 00000000..4e518dee --- /dev/null +++ b/examples/configs/demo/k8s/nvidia-gpu.yaml @@ -0,0 +1,32 @@ +# Kubernetes — NVIDIA GPU cluster (A100/H100), single node × 4 GPUs +# madengine run --config examples/configs/demo/k8s/nvidia-gpu.yaml + +model: + tags: [dummy_torchrun] + +gpu_vendor: NVIDIA +guest_os: UBUNTU + +k8s: + namespace: default + gpu_count: 4 + gpu_resource_name: nvidia.com/gpu + memory: 128Gi + memory_limit: 256Gi + cpu: "48" + cpu_limit: "96" + node_selector: + accelerator: nvidia-tesla-a100 + +distributed: + enabled: true + launcher: torchrun + nnodes: 1 + nproc_per_node: 4 + master_port: 29500 + +env_vars: + OMP_NUM_THREADS: "12" + NCCL_DEBUG: WARN + NCCL_IB_DISABLE: "1" + NCCL_SOCKET_IFNAME: eth0 diff --git a/examples/configs/demo/k8s/sglang-disagg.yaml b/examples/configs/demo/k8s/sglang-disagg.yaml new file mode 100644 index 00000000..f9ec614f --- /dev/null +++ b/examples/configs/demo/k8s/sglang-disagg.yaml @@ -0,0 +1,38 @@ +# Kubernetes — SGLang disaggregated inference (5 pods: 1 proxy + 2 prefill + 2 decode) +# madengine run --config examples/configs/demo/k8s/sglang-disagg.yaml +# +# To customize the prefill/decode split, set distributed.sglang_disagg.prefill_nodes and +# distributed.sglang_disagg.decode_nodes (they must sum to nnodes - 1; one node is the proxy).
+ +model: + tags: [dummy_sglang_disagg] + +k8s: + namespace: default + gpu_count: 8 + memory: 256Gi + memory_limit: 384Gi + cpu: "64" + cpu_limit: "96" + host_ipc: true + node_selector: + feature.node.kubernetes.io/amd-gpu-mi300x: "true" + +distributed: + enabled: true + launcher: sglang-disagg + nnodes: 5 + nproc_per_node: 8 + master_port: 29500 + sglang_disagg: + prefill_nodes: 2 + decode_nodes: 2 + +env_vars: + SGLANG_ALLOW_LONG_MAX_MODEL_LEN: "1" + SGLANG_ENABLE_RADIX_CACHE: "1" + SGLANG_RADIX_CACHE_SIZE: "0.9" + SGLANG_DISAGG_TRANSFER_BACKEND: mooncake + NCCL_TIMEOUT: "600" + RAY_health_check_timeout_ms: "60000" + MOONCAKE_TEST_MODE: "0" diff --git a/examples/configs/demo/k8s/sglang-inference.yaml b/examples/configs/demo/k8s/sglang-inference.yaml new file mode 100644 index 00000000..62c4579f --- /dev/null +++ b/examples/configs/demo/k8s/sglang-inference.yaml @@ -0,0 +1,30 @@ +# Kubernetes — SGLang inference, tensor + data parallelism (2 pods × 4 GPUs) +# madengine run --config examples/configs/demo/k8s/sglang-inference.yaml + +model: + tags: [dummy_sglang] + +k8s: + namespace: default + gpu_count: 4 + memory: 256Gi + memory_limit: 384Gi + cpu: "64" + cpu_limit: "96" + host_ipc: true + node_selector: + feature.node.kubernetes.io/amd-gpu-mi300x: "true" + +distributed: + enabled: true + launcher: sglang + nnodes: 2 + nproc_per_node: 4 + master_port: 29500 + +env_vars: + SGLANG_ALLOW_LONG_MAX_MODEL_LEN: "1" + SGLANG_ENABLE_RADIX_CACHE: "1" + SGLANG_RADIX_CACHE_SIZE: "0.9" + NCCL_TIMEOUT: "600" + RAY_health_check_timeout_ms: "60000" diff --git a/examples/configs/demo/k8s/single-gpu.yaml b/examples/configs/demo/k8s/single-gpu.yaml new file mode 100644 index 00000000..be528719 --- /dev/null +++ b/examples/configs/demo/k8s/single-gpu.yaml @@ -0,0 +1,16 @@ +# Kubernetes — single GPU pod (no distribution) +# madengine run --config examples/configs/demo/k8s/single-gpu.yaml + +model: + tags: [dummy] + +k8s: + namespace: default + gpu_count: 1 + memory: 16Gi + 
memory_limit: 32Gi + cpu: "8" + cpu_limit: "16" + +env_vars: + OMP_NUM_THREADS: "8" diff --git a/examples/configs/demo/k8s/torchtitan.yaml b/examples/configs/demo/k8s/torchtitan.yaml new file mode 100644 index 00000000..3268915e --- /dev/null +++ b/examples/configs/demo/k8s/torchtitan.yaml @@ -0,0 +1,28 @@ +# Kubernetes — TorchTitan LLM pre-training with TP + PP + FSDP2 (4 pods × 8 GPUs) +# madengine run --config examples/configs/demo/k8s/torchtitan.yaml + +model: + tags: [dummy_torchtitan] + +k8s: + namespace: ml-training + gpu_count: 8 + memory: 512Gi + memory_limit: 768Gi + cpu: "96" + cpu_limit: "128" + host_ipc: true + node_selector: + feature.node.kubernetes.io/amd-gpu-mi300x: "true" + +distributed: + enabled: true + launcher: torchtitan + nnodes: 4 + nproc_per_node: 8 + master_port: 29500 + +env_vars: + PYTORCH_TUNABLEOP_ENABLED: "1" + PYTORCH_TUNABLEOP_TUNING: "1" + NCCL_DEBUG: INFO diff --git a/examples/configs/demo/k8s/vllm-inference.yaml b/examples/configs/demo/k8s/vllm-inference.yaml new file mode 100644 index 00000000..f9a44905 --- /dev/null +++ b/examples/configs/demo/k8s/vllm-inference.yaml @@ -0,0 +1,31 @@ +# Kubernetes — vLLM inference, data parallelism (2 pods × 4 GPUs, one replica per pod) +# madengine run --config examples/configs/demo/k8s/vllm-inference.yaml + +model: + tags: [dummy_vllm] + +k8s: + namespace: default + gpu_count: 4 + memory: 256Gi + memory_limit: 384Gi + cpu: "64" + cpu_limit: "96" + host_ipc: true + node_selector: + feature.node.kubernetes.io/amd-gpu-mi300x: "true" + +distributed: + enabled: true + launcher: vllm + nnodes: 2 + nproc_per_node: 4 + master_port: 29500 + +env_vars: + VLLM_ALLOW_LONG_MAX_MODEL_LEN: "1" + VLLM_WORKER_MULTIPROC_METHOD: spawn + VLLM_KV_CACHE_SIZE: "0.7" + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_TIMEOUT: "600" + RAY_health_check_timeout_ms: "60000" diff --git a/examples/configs/demo/local/deepspeed.yaml b/examples/configs/demo/local/deepspeed.yaml new file mode 100644 index 
00000000..5a3bcd15 --- /dev/null +++ b/examples/configs/demo/local/deepspeed.yaml @@ -0,0 +1,14 @@ +# Local Docker — DeepSpeed ZeRO distributed training +# madengine run --config examples/configs/demo/local/deepspeed.yaml + +model: + tags: [dummy_deepspeed] + +distributed: + enabled: true + launcher: deepspeed + nnodes: 1 + nproc_per_node: 4 + +env_vars: + OMP_NUM_THREADS: "8" diff --git a/examples/configs/demo/local/multi-gpu-torchrun.yaml b/examples/configs/demo/local/multi-gpu-torchrun.yaml new file mode 100644 index 00000000..fe2d5890 --- /dev/null +++ b/examples/configs/demo/local/multi-gpu-torchrun.yaml @@ -0,0 +1,16 @@ +# Local Docker — single node, multi-GPU with torchrun +# madengine run --config examples/configs/demo/local/multi-gpu-torchrun.yaml + +model: + tags: [dummy_torchrun] + +distributed: + enabled: true + launcher: torchrun + nnodes: 1 + nproc_per_node: 4 + master_port: 29500 + +env_vars: + OMP_NUM_THREADS: "8" + NCCL_DEBUG: WARN diff --git a/examples/configs/demo/local/profiling.yaml b/examples/configs/demo/local/profiling.yaml new file mode 100644 index 00000000..37f5ae30 --- /dev/null +++ b/examples/configs/demo/local/profiling.yaml @@ -0,0 +1,19 @@ +# Local Docker — single GPU with ROCm profiling +# madengine run --config examples/configs/demo/local/profiling.yaml + +model: + tags: [dummy] + +tools: + - name: rocprofv3_lightweight + - name: gpu_info_power_profiler + env_vars: + POWER_DEVICE: all + POWER_SAMPLING_RATE: "0.1" + - name: gpu_info_vram_profiler + env_vars: + VRAM_DEVICE: all + VRAM_SAMPLING_RATE: "0.1" + +env_vars: + OMP_NUM_THREADS: "8" diff --git a/examples/configs/demo/local/single-gpu.yaml b/examples/configs/demo/local/single-gpu.yaml new file mode 100644 index 00000000..b2a8e39b --- /dev/null +++ b/examples/configs/demo/local/single-gpu.yaml @@ -0,0 +1,8 @@ +# Local Docker — single GPU, no distribution +# madengine run --config examples/configs/demo/local/single-gpu.yaml + +model: + tags: [dummy] + +env_vars: + 
OMP_NUM_THREADS: "8" diff --git a/examples/configs/demo/local/vllm-inference.yaml b/examples/configs/demo/local/vllm-inference.yaml new file mode 100644 index 00000000..86e207ea --- /dev/null +++ b/examples/configs/demo/local/vllm-inference.yaml @@ -0,0 +1,16 @@ +# Local Docker — vLLM inference with tensor parallelism +# madengine run --config examples/configs/demo/local/vllm-inference.yaml + +model: + tags: [dummy_vllm] + +distributed: + enabled: true + launcher: vllm + nnodes: 1 + nproc_per_node: 4 + +env_vars: + VLLM_ALLOW_LONG_MAX_MODEL_LEN: "1" + VLLM_WORKER_MULTIPROC_METHOD: spawn + VLLM_KV_CACHE_SIZE: "0.7" diff --git a/examples/configs/demo/slurm/deepspeed.yaml b/examples/configs/demo/slurm/deepspeed.yaml new file mode 100644 index 00000000..01c6c82a --- /dev/null +++ b/examples/configs/demo/slurm/deepspeed.yaml @@ -0,0 +1,21 @@ +# SLURM — DeepSpeed ZeRO distributed training (single node) +# madengine run --config examples/configs/demo/slurm/deepspeed.yaml + +model: + tags: [dummy_deepspeed] + +slurm: + partition: amd-rccl + nodes: 1 + gpus_per_node: 4 + time: "02:00:00" + output_dir: ./slurm_results + +distributed: + enabled: true + launcher: deepspeed + nnodes: 1 + nproc_per_node: 4 + +env_vars: + OMP_NUM_THREADS: "8" diff --git a/examples/configs/demo/slurm/megatron-lm.yaml b/examples/configs/demo/slurm/megatron-lm.yaml new file mode 100644 index 00000000..1ebc2e86 --- /dev/null +++ b/examples/configs/demo/slurm/megatron-lm.yaml @@ -0,0 +1,26 @@ +# SLURM — Megatron-LM large-scale transformer training (4 nodes × 8 GPUs) +# madengine run --config examples/configs/demo/slurm/megatron-lm.yaml + +model: + tags: [dummy_megatron_lm] + +slurm: + partition: gpu + account: research + nodes: 4 + gpus_per_node: 8 + time: "24:00:00" + output_dir: ./slurm_results + exclusive: true + +distributed: + enabled: true + launcher: megatron + nnodes: 4 + nproc_per_node: 8 + master_port: 29500 + +env_vars: + OMP_NUM_THREADS: "16" + NCCL_DEBUG: INFO + NCCL_IB_DISABLE: "0" diff 
--git a/examples/configs/demo/slurm/multi-node-torchrun-infiniband.yaml b/examples/configs/demo/slurm/multi-node-torchrun-infiniband.yaml new file mode 100644 index 00000000..13350684 --- /dev/null +++ b/examples/configs/demo/slurm/multi-node-torchrun-infiniband.yaml @@ -0,0 +1,40 @@ +# SLURM — multi-node torchrun over InfiniBand (4 nodes × 8 GPUs) +# madengine run --config examples/configs/demo/slurm/multi-node-torchrun-infiniband.yaml + +model: + tags: [dummy_torchrun] + +slurm: + partition: amd-rccl + nodes: 4 + gpus_per_node: 8 + time: "48:00:00" + output_dir: ./slurm_results + exclusive: true + account: my-project + qos: high + network_interface: ib0 + +distributed: + enabled: true + launcher: torchrun + nnodes: 4 + nproc_per_node: 8 + backend: nccl + port: 29500 + +env_vars: + OMP_NUM_THREADS: "16" + NCCL_DEBUG: WARN + NCCL_TIMEOUT: "1200" + TORCH_NCCL_ASYNC_ERROR_HANDLING: "1" + TORCH_NCCL_HIGH_PRIORITY: "1" + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" + HSA_FORCE_FINE_GRAIN_PCIE: "1" + MIOPEN_FIND_MODE: "1" + MIOPEN_USER_DB_PATH: /tmp/.miopen + RCCL_ENABLE_HIPGRAPH: "0" + NCCL_IB_DISABLE: "0" + NCCL_IB_HCA: "mlx5_0:1,mlx5_1:1" + NCCL_SOCKET_IFNAME: ib0 diff --git a/examples/configs/demo/slurm/multi-node-torchrun.yaml b/examples/configs/demo/slurm/multi-node-torchrun.yaml new file mode 100644 index 00000000..e035474e --- /dev/null +++ b/examples/configs/demo/slurm/multi-node-torchrun.yaml @@ -0,0 +1,36 @@ +# SLURM — multi-node torchrun (2 nodes × 8 GPUs) +# madengine run --config examples/configs/demo/slurm/multi-node-torchrun.yaml + +model: + tags: [dummy_torchrun] + +slurm: + partition: amd-rccl + nodes: 2 + gpus_per_node: 8 + time: "24:00:00" + output_dir: ./slurm_results + exclusive: true + +distributed: + enabled: true + launcher: torchrun + nnodes: 2 + nproc_per_node: 8 + backend: nccl + port: 29500 + +env_vars: + OMP_NUM_THREADS: "8" + NCCL_DEBUG: WARN + NCCL_TIMEOUT: "600" + TORCH_NCCL_ASYNC_ERROR_HANDLING: "1" + TORCH_NCCL_HIGH_PRIORITY: "1" + 
GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" + HSA_FORCE_FINE_GRAIN_PCIE: "1" + MIOPEN_FIND_MODE: "1" + MIOPEN_USER_DB_PATH: /tmp/.miopen + RCCL_ENABLE_HIPGRAPH: "0" + NCCL_IB_DISABLE: "1" + NCCL_SOCKET_IFNAME: eth0 diff --git a/examples/configs/demo/slurm/profiling-multi-gpu.yaml b/examples/configs/demo/slurm/profiling-multi-gpu.yaml new file mode 100644 index 00000000..2ae4b8a4 --- /dev/null +++ b/examples/configs/demo/slurm/profiling-multi-gpu.yaml @@ -0,0 +1,36 @@ +# SLURM — multi-GPU run with RCCL communication + power + VRAM profiling +# madengine run --config examples/configs/demo/slurm/profiling-multi-gpu.yaml + +model: + tags: [dummy_torchrun] + +slurm: + partition: amd-rccl + nodes: 1 + gpus_per_node: 4 + time: "02:00:00" + output_dir: ./slurm_results + +distributed: + enabled: true + launcher: torchrun + nnodes: 1 + nproc_per_node: 4 + +tools: + - name: rocprofv3_lightweight + - name: rocprofv3_communication + env_vars: + RCCL_DEBUG: INFO + - name: gpu_info_power_profiler + env_vars: + POWER_DEVICE: all + POWER_SAMPLING_RATE: "0.1" + - name: gpu_info_vram_profiler + env_vars: + VRAM_DEVICE: all + VRAM_SAMPLING_RATE: "0.1" + +env_vars: + OMP_NUM_THREADS: "8" + NCCL_DEBUG: WARN diff --git a/examples/configs/demo/slurm/sglang-disagg.yaml b/examples/configs/demo/slurm/sglang-disagg.yaml new file mode 100644 index 00000000..9a17d3ad --- /dev/null +++ b/examples/configs/demo/slurm/sglang-disagg.yaml @@ -0,0 +1,41 @@ +# SLURM — SGLang disaggregated inference (5 nodes: 1 proxy + 2 prefill + 2 decode) +# madengine run --config examples/configs/demo/slurm/sglang-disagg.yaml +# +# To customize the prefill/decode split, set distributed.sglang_disagg.prefill_nodes and +# distributed.sglang_disagg.decode_nodes (they must sum to nnodes - 1; one node is the proxy).
+ +model: + tags: [dummy_sglang_disagg] + +slurm: + partition: amd-rccl + nodes: 5 + gpus_per_node: 8 + time: "04:00:00" + output_dir: ./slurm_results + exclusive: true + +distributed: + enabled: true + launcher: sglang-disagg + nnodes: 5 + nproc_per_node: 8 + backend: nccl + port: 29500 + sglang_disagg: + prefill_nodes: 2 + decode_nodes: 2 + +env_vars: + SGLANG_ALLOW_LONG_MAX_MODEL_LEN: "1" + SGLANG_ENABLE_RADIX_CACHE: "1" + SGLANG_RADIX_CACHE_SIZE: "0.9" + SGLANG_DISAGG_TRANSFER_BACKEND: mooncake + HSA_FORCE_FINE_GRAIN_PCIE: "1" + HSA_ENABLE_SDMA: "0" + GPU_MAX_HW_QUEUES: "2" + NCCL_DEBUG: WARN + NCCL_IB_DISABLE: "0" + NCCL_IB_HCA: mlx5_0 + NCCL_SOCKET_IFNAME: ib0 + RAY_DEDUP_LOGS: "1" diff --git a/examples/configs/demo/slurm/sglang-inference.yaml b/examples/configs/demo/slurm/sglang-inference.yaml new file mode 100644 index 00000000..0ddbc5f2 --- /dev/null +++ b/examples/configs/demo/slurm/sglang-inference.yaml @@ -0,0 +1,32 @@ +# SLURM — SGLang inference, tensor + data parallelism (2 nodes × 4 GPUs) +# madengine run --config examples/configs/demo/slurm/sglang-inference.yaml + +model: + tags: [dummy_sglang] + +slurm: + partition: amd-rccl + nodes: 2 + gpus_per_node: 4 + time: "04:00:00" + output_dir: ./slurm_results + exclusive: true + +distributed: + enabled: true + launcher: sglang + nnodes: 2 + nproc_per_node: 4 + backend: nccl + port: 29500 + +env_vars: + SGLANG_ALLOW_LONG_MAX_MODEL_LEN: "1" + SGLANG_ENABLE_RADIX_CACHE: "1" + SGLANG_RADIX_CACHE_SIZE: "0.9" + SGLANG_LOGGING_LEVEL: INFO + HSA_FORCE_FINE_GRAIN_PCIE: "1" + HSA_ENABLE_SDMA: "0" + GPU_MAX_HW_QUEUES: "2" + NCCL_DEBUG: WARN + RAY_DEDUP_LOGS: "1" diff --git a/examples/configs/demo/slurm/single-node-single-gpu.yaml b/examples/configs/demo/slurm/single-node-single-gpu.yaml new file mode 100644 index 00000000..29bf1544 --- /dev/null +++ b/examples/configs/demo/slurm/single-node-single-gpu.yaml @@ -0,0 +1,15 @@ +# SLURM — single node, single GPU (no distribution) +# madengine run --config 
examples/configs/demo/slurm/single-node-single-gpu.yaml + +model: + tags: [dummy] + +slurm: + partition: amd-rccl + nodes: 1 + gpus_per_node: 1 + time: "01:00:00" + output_dir: ./slurm_results + +env_vars: + OMP_NUM_THREADS: "8" diff --git a/examples/configs/demo/slurm/torchtitan.yaml b/examples/configs/demo/slurm/torchtitan.yaml new file mode 100644 index 00000000..eb9233f1 --- /dev/null +++ b/examples/configs/demo/slurm/torchtitan.yaml @@ -0,0 +1,26 @@ +# SLURM — TorchTitan LLM pre-training with TP + PP + FSDP2 (4 nodes × 8 GPUs) +# madengine run --config examples/configs/demo/slurm/torchtitan.yaml + +model: + tags: [dummy_torchtitan] + +slurm: + partition: amd-rccl + nodes: 4 + gpus_per_node: 8 + time: "72:00:00" + output_dir: ./slurm_results + exclusive: true + mem: 512G + constraint: MI300X + +distributed: + enabled: true + launcher: torchtitan + nnodes: 4 + nproc_per_node: 8 + master_port: 29500 + +env_vars: + OMP_NUM_THREADS: "8" + NCCL_DEBUG: WARN diff --git a/examples/configs/demo/slurm/vllm-inference.yaml b/examples/configs/demo/slurm/vllm-inference.yaml new file mode 100644 index 00000000..3d1dcd72 --- /dev/null +++ b/examples/configs/demo/slurm/vllm-inference.yaml @@ -0,0 +1,31 @@ +# SLURM — vLLM inference, data parallelism (2 nodes × 4 GPUs, one replica per node) +# madengine run --config examples/configs/demo/slurm/vllm-inference.yaml + +model: + tags: [dummy_vllm] + +slurm: + partition: amd-rccl + nodes: 2 + gpus_per_node: 4 + time: "04:00:00" + output_dir: ./slurm_results + exclusive: true + enable_node_check: true + +distributed: + enabled: true + launcher: vllm + nnodes: 2 + nproc_per_node: 4 + backend: nccl + port: 29500 + +env_vars: + VLLM_ALLOW_LONG_MAX_MODEL_LEN: "1" + VLLM_WORKER_MULTIPROC_METHOD: spawn + VLLM_KV_CACHE_SIZE: "0.8" + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + HSA_FORCE_FINE_GRAIN_PCIE: "1" + NCCL_TIMEOUT: "300" + NCCL_DEBUG: WARN diff --git a/examples/configs/templates/k8s.yaml b/examples/configs/templates/k8s.yaml 
new file mode 100644 index 00000000..c6dc95ec --- /dev/null +++ b/examples/configs/templates/k8s.yaml @@ -0,0 +1,180 @@ +# ============================================================ +# madengine YAML Config Template — Kubernetes +# ============================================================ +# Copy this file, fill in your model and cluster details, and +# remove or comment out any sections you don't need. +# +# Usage: +# madengine run --config k8s.yaml +# madengine run --config k8s.yaml --config distributed.nnodes=4 +# +# Target is inferred automatically from the presence of the +# 'k8s' (or 'kubernetes') key — no explicit deploy field is needed. +# ============================================================ + +# ------------------------------------ +# Model selection +# ------------------------------------ +model: + # Run by tag — discovers matching models from models.json + tags: [dummy_torchrun] + + # OR point at a pre-built manifest + # manifest_file: build_manifest.json + + # OR use a specific container image directly + # container_image: myrepo/myimage:latest + + skip_run: false + # timeout: 86400 + +# ------------------------------------ +# Kubernetes job settings +# ------------------------------------ +k8s: + kubeconfig: ~/.kube/config # Path to kubeconfig file + namespace: default # Kubernetes namespace for the job + + # GPU resources + gpu_count: 8 # GPUs per pod + gpu_resource_name: amd.com/gpu # Resource name (nvidia.com/gpu for NVIDIA) + + # CPU and memory per pod + memory: 256Gi + memory_limit: 384Gi + cpu: "64" + cpu_limit: "96" + + # Pod lifecycle + image_pull_policy: Always # Always | IfNotPresent | Never + backoff_limit: 3 # Retry attempts on pod failure + # ttl_seconds_after_finished: 3600 # Auto-delete job N seconds after completion + + # IPC namespace sharing (required for multi-GPU NCCL/RCCL) + host_ipc: true + + # Re-create shared data PVC on each run (useful when data changes) + recreate_shared_data_pvc: false + + # Node selection 
(optional) + node_selector: {} + # feature.node.kubernetes.io/amd-gpu-mi300x: "true" + # topology.kubernetes.io/zone: us-west-2a + + # Tolerations (optional — allow scheduling on tainted nodes) + tolerations: [] + # - key: gpu + # operator: Equal + # value: amd + # effect: NoSchedule + + # Storage (optional) + # results_pvc: my-results-pvc # PVC for writing results (mounted at /results) + # data_pvc: my-data-pvc # Existing PVC for input data (auto-created if omitted) + + # Storage classes (override cluster defaults) + # nfs_storage_class: nfs-banff + # local_path_storage_class: local-path + # data_storage_class: nfs-banff + + # Output dir for generated Kubernetes manifests + # output_dir: ./k8s_manifests + + # Secrets (image pull and runtime credentials) + secrets: + strategy: from_local_credentials # from_local_credentials | from_secret + image_pull_secret_names: [] + # runtime_secret_name: my-runtime-secret + + # Allow privileged containers for profiling tools (e.g. rocprofv3) + # allow_privileged_profiling: true + +# ------------------------------------ +# Distributed launcher +# ------------------------------------ +distributed: + enabled: true + launcher: torchrun # torchrun | deepspeed | megatron | torchtitan | vllm | sglang | sglang-disagg | primus | native + backend: nccl # nccl | gloo + nnodes: 2 # Number of pods (each runs one replica of the job) + nproc_per_node: 8 # Must match k8s.gpu_count + master_port: 29500 + + # SGLang disaggregated only: override default prefill/decode split + # sglang_disagg: + # prefill_nodes: 2 + # decode_nodes: 2 + +# ------------------------------------ +# Environment variables +# ------------------------------------ +env_vars: + OMP_NUM_THREADS: "8" + NCCL_DEBUG: WARN + NCCL_IB_DISABLE: "1" + NCCL_SOCKET_IFNAME: eth0 + TORCH_NCCL_HIGH_PRIORITY: "1" + GPU_MAX_HW_QUEUES: "2" + TORCH_NCCL_ASYNC_ERROR_HANDLING: "1" + NCCL_TIMEOUT: "600" + HSA_ENABLE_SDMA: "0" + HSA_FORCE_FINE_GRAIN_PCIE: "1" +
MIOPEN_FIND_MODE: "1" + MIOPEN_USER_DB_PATH: /tmp/.miopen + RCCL_ENABLE_HIPGRAPH: "0" + + # Point to the PVC mount for data provider models + # MAD_DATAHOME: /data + +# ------------------------------------ +# Hardware (defaults to AMD/Ubuntu) +# ------------------------------------ +# gpu_vendor: AMD # AMD | NVIDIA +# guest_os: UBUNTU # UBUNTU | CENTOS + +# ------------------------------------ +# Profiling tools (optional) +# Requires allow_privileged_profiling: true in k8s section above +# ------------------------------------ +# tools: [] +# - name: rocprofv3_lightweight +# - name: rocprofv3_communication +# env_vars: +# RCCL_DEBUG: INFO +# - name: gpu_info_power_profiler +# env_vars: +# POWER_DEVICE: all +# POWER_SAMPLING_RATE: "0.1" +# - name: gpu_info_vram_profiler +# env_vars: +# VRAM_DEVICE: all +# VRAM_SAMPLING_RATE: "0.1" + +# ------------------------------------ +# Scripts (optional) +# ------------------------------------ +# pre_scripts: [] +# post_scripts: [] +# encapsulate_script: null + +# ------------------------------------ +# Log error scanning (optional) +# ------------------------------------ +log_error: + pattern_scan: true + benign_patterns: [] + patterns: [] + +# ------------------------------------ +# Output (optional) +# ------------------------------------ +output: perf.csv +# summary_output: null +# data_config: data.json + +# ------------------------------------ +# Misc +# ------------------------------------ +debug: false +live_output: false diff --git a/examples/configs/templates/local.yaml b/examples/configs/templates/local.yaml new file mode 100644 index 00000000..74ce73b4 --- /dev/null +++ b/examples/configs/templates/local.yaml @@ -0,0 +1,137 @@ +# ============================================================ +# madengine YAML Config Template — Local Docker +# ============================================================ +# Copy this file, fill in your model details, and remove or +# comment out any sections you don't need. 
+# +# Usage: +# madengine run --config local.yaml +# madengine run --config local.yaml --config docker.gpus=0,1 +# +# Note: --config is mutually exclusive with --additional-context. +# ============================================================ + +# ------------------------------------ +# Model selection (pick one approach) +# ------------------------------------ +model: + # Run by tag — discovers matching models from models.json + tags: [dummy] + + # OR point at a pre-built manifest instead of discovering/building + # manifest_file: build_manifest.json + + # OR use a specific container image directly (skips build step) + # container_image: myrepo/myimage:latest + + # Skip the run step (only build) + skip_run: false + + # Per-model timeout in seconds (overrides model's own timeout field) + # timeout: 3600 + +# ------------------------------------ +# Docker options (local runs only) +# ------------------------------------ +docker: + # Extra --build-arg values passed to docker build + build_args: {} + # ROCM_VERSION: "6.2" + # BASE_IMAGE: rocm/pytorch:latest + + # Extra -e values passed to docker run + env_vars: {} + # MY_VAR: my_value + + # Volume mounts: { /path/in/container: /path/on/host } + mounts: {} + # /data: /mnt/shared/datasets + # /results: /home/user/results + + # Override which GPUs to expose (comma-separated indices or "all") + # Defaults to all available GPUs on the host + # gpus: "0,1" + + # Limit CPU cores (docker --cpus) + # cpus: "8" + + # Append arbitrary docker run flags not covered by other options + # additional_run_options: "--shm-size=16g --ulimit memlock=-1" + + # Keep container running after the script exits (useful for debugging) + keep_alive: false + + # Remove Docker build cache before building + clean_cache: false + +# ------------------------------------ +# Distributed launcher (optional) +# ------------------------------------ +# Remove this section entirely for single-GPU, non-distributed runs. 
+# distributed: +# enabled: true +# launcher: torchrun # torchrun | deepspeed | megatron | torchtitan | vllm | sglang | sglang-disagg | primus | native +# backend: nccl # nccl | gloo +# nnodes: 1 +# nproc_per_node: 4 # GPUs per node +# master_port: 29500 + +# ------------------------------------ +# Environment variables +# ------------------------------------ +# Injected into the container at runtime (separate from docker.env_vars, +# which are passed at docker run rather than derived from context). +env_vars: {} +# OMP_NUM_THREADS: "8" +# NCCL_DEBUG: WARN + +# ------------------------------------ +# Hardware (defaults to AMD/Ubuntu) +# ------------------------------------ +# gpu_vendor: AMD # AMD | NVIDIA +# guest_os: UBUNTU # UBUNTU | CENTOS + +# ------------------------------------ +# Profiling tools (optional) +# ------------------------------------ +# tools: [] +# - name: rocprofv3_lightweight +# - name: gpu_info_power_profiler +# env_vars: +# POWER_DEVICE: all +# POWER_SAMPLING_RATE: "0.1" +# - name: gpu_info_vram_profiler +# env_vars: +# VRAM_DEVICE: all +# VRAM_SAMPLING_RATE: "0.1" +# - name: rocm_trace_lite +# - name: miopen_trace +# - name: rocblas_trace + +# ------------------------------------ +# Scripts (optional) +# ------------------------------------ +# pre_scripts: [] # Run inside the container before the main script +# post_scripts: [] # Run inside the container after the main script +# encapsulate_script: null # Wrap the main script (e.g. 
a profiler launcher) + +# ------------------------------------ +# Log error scanning (optional) +# ------------------------------------ +log_error: + pattern_scan: true # Scan container output for known error patterns + benign_patterns: [] # Regex patterns to ignore (false-positive suppression) + patterns: [] # Additional error patterns to flag as failures + +# ------------------------------------ +# Output (optional) +# ------------------------------------ +output: perf.csv # Where to write the results CSV +# summary_output: null # Optional JSON summary file +# data_config: data.json # Data provider config file + +# ------------------------------------ +# Misc +# ------------------------------------ +debug: false # Enable verbose debug logging +live_output: false # Stream container stdout/stderr in real time diff --git a/examples/configs/templates/slurm.yaml b/examples/configs/templates/slurm.yaml new file mode 100644 index 00000000..a4c396f8 --- /dev/null +++ b/examples/configs/templates/slurm.yaml @@ -0,0 +1,165 @@ +# ============================================================ +# madengine YAML Config Template — SLURM +# ============================================================ +# Copy this file, fill in your model and cluster details, and +# remove or comment out any sections you don't need. +# +# Usage: +# madengine run --config slurm.yaml +# madengine run --config slurm.yaml --config distributed.nnodes=4 +# +# Target is inferred automatically from the presence of the +# 'slurm' key — no explicit deploy field is needed. 
+# ============================================================ + +# ------------------------------------ +# Model selection +# ------------------------------------ +model: + # Run by tag — discovers matching models from models.json + tags: [dummy_torchrun] + + # OR point at a pre-built manifest + # manifest_file: build_manifest.json + + # OR use a specific container image directly + # container_image: myrepo/myimage:latest + + skip_run: false + # timeout: 86400 + +# ------------------------------------ +# SLURM job settings +# ------------------------------------ +slurm: + partition: amd-rccl # SLURM partition name (required) + nodes: 2 # Number of nodes to allocate + gpus_per_node: 8 # GPUs per node + time: "24:00:00" # Wall-clock time limit (HH:MM:SS) + output_dir: ./slurm_results # Where SLURM stdout/stderr logs are written + exclusive: true # Request exclusive node access (no sharing) + + # Optional: restrict to a specific set of nodes (disables node health check) + # nodelist: node01,node02 + + # Optional: exclude specific nodes + # exclude: node03,node04 + + # Optional: hardware constraint (e.g. GPU architecture label) + # constraint: MI300X + + # Optional: SLURM account and QoS + # account: my-project + # qos: high + + # Optional: memory limit per node (e.g. 
"256G") + # mem: 256G + + # Optional: environment modules to load before the job starts + # modules: + # - rocm/6.2.0 + # - gcc/11.2.0 + + # Optional: shared filesystem paths (accessible from all nodes) + # shared_workspace: /lustre/shared/workspace + # results_dir: /lustre/shared/results + + # Optional: network interface for NCCL/RCCL (override auto-detection) + # network_interface: ib0 + + # Optional: node health preflight (GPU memory check before job starts) + enable_node_check: true + auto_cleanup_nodes: false + verbose_node_check: false + +# ------------------------------------ +# Distributed launcher +# ------------------------------------ +distributed: + launcher: torchrun # torchrun | deepspeed | megatron | torchtitan | vllm | sglang | sglang-disagg | primus | native + backend: nccl # nccl | gloo + nnodes: 2 # Must match slurm.nodes + nproc_per_node: 8 # Must match slurm.gpus_per_node + port: 29500 # Master port for distributed rendezvous + + # SGLang disaggregated only: override default prefill/decode split + # sglang_disagg: + # prefill_nodes: 2 + # decode_nodes: 2 + +# ------------------------------------ +# Environment variables +# ------------------------------------ +env_vars: + OMP_NUM_THREADS: "8" + NCCL_DEBUG: WARN + NCCL_DEBUG_SUBSYS: "INIT,NET" + NCCL_TIMEOUT: "600" + TORCH_NCCL_ASYNC_ERROR_HANDLING: "1" + TORCH_NCCL_HIGH_PRIORITY: "1" + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" + HSA_FORCE_FINE_GRAIN_PCIE: "1" + MIOPEN_FIND_MODE: "1" + MIOPEN_USER_DB_PATH: /tmp/.miopen + RCCL_ENABLE_HIPGRAPH: "0" + + # InfiniBand (uncomment when using IB interconnect) + # NCCL_IB_DISABLE: "0" + # NCCL_IB_HCA: "mlx5_0:1,mlx5_1:1" + # NCCL_SOCKET_IFNAME: ib0 + + # Ethernet (uncomment when using Ethernet interconnect) + # NCCL_IB_DISABLE: "1" + # NCCL_SOCKET_IFNAME: eth0 + +# ------------------------------------ +# Hardware (defaults to AMD/Ubuntu) +# ------------------------------------ +# gpu_vendor: AMD # AMD | NVIDIA +# guest_os: UBUNTU # UBUNTU | CENTOS + 
+# ------------------------------------ +# Profiling tools (optional) +# ------------------------------------ +# tools: [] +# - name: rocprofv3_lightweight +# - name: rocprofv3_communication +# env_vars: +# RCCL_DEBUG: INFO +# - name: gpu_info_power_profiler +# env_vars: +# POWER_DEVICE: all +# POWER_SAMPLING_RATE: "0.1" +# - name: gpu_info_vram_profiler +# env_vars: +# VRAM_DEVICE: all +# VRAM_SAMPLING_RATE: "0.1" + +# ------------------------------------ +# Scripts (optional) +# ------------------------------------ +# pre_scripts: [] +# post_scripts: [] +# encapsulate_script: null + +# ------------------------------------ +# Log error scanning (optional) +# ------------------------------------ +log_error: + pattern_scan: true + benign_patterns: [] + patterns: [] + +# ------------------------------------ +# Output (optional) +# ------------------------------------ +output: perf.csv +# summary_output: null +# data_config: data.json + +# ------------------------------------ +# Misc +# ------------------------------------ +debug: false +live_output: false diff --git a/examples/k8s-configs/basic/01-native-single-node-single-gpu-tools.json b/examples/k8s-configs/basic/01-native-single-node-single-gpu-tools.json index 8acb9127..a5fa27e4 100644 --- a/examples/k8s-configs/basic/01-native-single-node-single-gpu-tools.json +++ b/examples/k8s-configs/basic/01-native-single-node-single-gpu-tools.json @@ -2,32 +2,31 @@ "_comment": "Single Node, Single GPU with Tools", "_description": "Single GPU configuration with GPU profiling tools", "_use_case": "Single GPU benchmarks with monitoring, no distributed execution", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "tools": [{ "name": "gpu_info_vram_profiler" }], - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "default", "gpu_count": 1, - + "memory": "16Gi", "memory_limit": "32Gi", "cpu": "8", "cpu_limit": "16", - + "image_pull_policy": "Always", "backoff_limit": 3 }, - + "env_vars": { "OMP_NUM_THREADS": "8" }, - + 
"debug": false } - diff --git a/examples/k8s-configs/basic/01-native-single-node-single-gpu.json b/examples/k8s-configs/basic/01-native-single-node-single-gpu.json index 373c8eea..9d7174df 100644 --- a/examples/k8s-configs/basic/01-native-single-node-single-gpu.json +++ b/examples/k8s-configs/basic/01-native-single-node-single-gpu.json @@ -2,27 +2,27 @@ "_comment": "Single Node, Single GPU - Basic Configuration", "_description": "Configuration for running a model on a single GPU in a Kubernetes cluster", "_use_case": "Testing, small models, quick benchmarks (single GPU, no distributed execution)", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "default", "gpu_count": 1, - + "memory": "16Gi", "memory_limit": "32Gi", "cpu": "8", "cpu_limit": "16", - + "image_pull_policy": "Always", "backoff_limit": 3 }, - + "env_vars": { "OMP_NUM_THREADS": "8" }, - + "debug": false } diff --git a/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu-tools.json b/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu-tools.json index 3c5f80ae..695c6804 100644 --- a/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu-tools.json +++ b/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu-tools.json @@ -2,26 +2,26 @@ "_comment": "Single Node, Multiple GPUs (2 GPUs) with Tools", "_description": "2 GPU configuration with torchrun and GPU profiling tools", "_use_case": "Multi-GPU training with performance monitoring on busy clusters", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "tools": [{"name": "gpu_info_power_profiler"}, {"name": "gpu_info_vram_profiler"}, {"name": "rocprof"}, {"name": "rpd"}, {"name": "miopen_trace"}, {"name": "rocblas_trace"}, {"name": "tensile_trace"}], - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "default", "gpu_count": 2, - + "memory": "64Gi", "memory_limit": "128Gi", "cpu": "16", "cpu_limit": "32", - + "image_pull_policy": "Always", "backoff_limit": 3 }, - + 
"distributed": { "enabled": true, "backend": "nccl", @@ -30,7 +30,7 @@ "nproc_per_node": 2, "master_port": 29500 }, - + "env_vars": { "NCCL_DEBUG": "WARN", "NCCL_IB_DISABLE": "1", @@ -44,7 +44,7 @@ "HSA_FORCE_FINE_GRAIN_PCIE": "1", "RCCL_ENABLE_HIPGRAPH": "0" }, - + "_env_var_notes": { "NCCL_DEBUG": "Changed from INFO to WARN to reduce log verbosity", "MIOPEN_FIND_MODE": "1 = Use compiled kernels, avoid find-db warnings", @@ -52,6 +52,6 @@ "HSA_FORCE_FINE_GRAIN_PCIE": "Helps with IOMMU-related warnings in containers", "RCCL_ENABLE_HIPGRAPH": "Disable for compatibility (experimental feature)" }, - + "debug": false } diff --git a/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json b/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json index be0d7c5e..d0a7ebf5 100644 --- a/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json +++ b/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json @@ -2,25 +2,25 @@ "_comment": "Single Node, Multiple GPUs (2 GPUs) - Multi-GPU Testing", "_description": "Configuration for running a model on 2 GPUs on a single node with torchrun", "_use_case": "Multi-GPU training and testing on busy clusters", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "default", "gpu_count": 2, - + "memory": "64Gi", "memory_limit": "128Gi", "cpu": "16", "cpu_limit": "32", - + "image_pull_policy": "Always", "backoff_limit": 3, "recreate_shared_data_pvc": true }, - + "distributed": { "enabled": true, "backend": "nccl", @@ -29,7 +29,7 @@ "nproc_per_node": 2, "master_port": 29500 }, - + "env_vars": { "NCCL_DEBUG": "WARN", "NCCL_IB_DISABLE": "1", @@ -43,7 +43,7 @@ "HSA_FORCE_FINE_GRAIN_PCIE": "1", "RCCL_ENABLE_HIPGRAPH": "0" }, - + "_env_var_notes": { "NCCL_DEBUG": "Changed from INFO to WARN to reduce log verbosity", "MIOPEN_FIND_MODE": "1 = Use compiled kernels, avoid find-db warnings", @@ -52,6 +52,6 @@ "RCCL_ENABLE_HIPGRAPH": "Disable for compatibility 
(experimental feature)", "NCCL_MIN_NCHANNELS": "Removed (warning says ignored for <8 GPUs)" }, - + "debug": false } diff --git a/examples/k8s-configs/basic/03-torchrun-multi-node-basic.json b/examples/k8s-configs/basic/03-torchrun-multi-node-basic.json index 0c2205f9..bf3a34af 100644 --- a/examples/k8s-configs/basic/03-torchrun-multi-node-basic.json +++ b/examples/k8s-configs/basic/03-torchrun-multi-node-basic.json @@ -2,26 +2,26 @@ "_comment": "Multi-Node (2 nodes, 2 GPUs each) - Basic Configuration", "_description": "Configuration for distributed workload across 2 nodes with 2 GPUs per node (4 GPUs total)", "_use_case": "Multi-node distributed execution testing on busy clusters", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "default", "gpu_count": 2, - + "memory": "64Gi", "memory_limit": "128Gi", "cpu": "16", "cpu_limit": "32", - + "image_pull_policy": "Always", "backoff_limit": 3, "host_ipc": true, "recreate_shared_data_pvc": true }, - + "distributed": { "enabled": true, "backend": "nccl", @@ -30,7 +30,7 @@ "nproc_per_node": 2, "master_port": 29500 }, - + "env_vars": { "NCCL_DEBUG": "WARN", "NCCL_DEBUG_SUBSYS": "INIT,NET", @@ -48,7 +48,7 @@ "HSA_NO_SCRATCH_RECLAIM": "1", "RCCL_ENABLE_HIPGRAPH": "0" }, - + "_env_var_notes": { "NCCL_DEBUG": "Changed to WARN for cleaner logs (use INFO for debugging)", "MIOPEN_FIND_MODE": "1 = Use compiled kernels, avoid find-db warnings", @@ -57,6 +57,6 @@ "RCCL_ENABLE_HIPGRAPH": "Disable for compatibility", "NCCL_MIN_NCHANNELS": "Removed (warning says ignored for <8 GPUs)" }, - + "debug": false } diff --git a/examples/k8s-configs/basic/04-torchrun-multi-node-advanced.json b/examples/k8s-configs/basic/04-torchrun-multi-node-advanced.json index 5560ffab..577ed424 100644 --- a/examples/k8s-configs/basic/04-torchrun-multi-node-advanced.json +++ b/examples/k8s-configs/basic/04-torchrun-multi-node-advanced.json @@ -2,31 +2,31 @@ "_comment": "Multi-Node (4 nodes, 2 GPUs each) - 
Advanced Configuration", "_description": "Full-featured configuration for large-scale distributed workloads with PVCs, tolerations, and node affinity", "_use_case": "Multi-node distributed execution with advanced features on busy clusters (8 GPUs total)", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "ml-training", "gpu_count": 2, "gpu_resource_name": "amd.com/gpu", - + "memory": "128Gi", "memory_limit": "192Gi", "cpu": "24", "cpu_limit": "32", - + "image_pull_policy": "IfNotPresent", "backoff_limit": 5, "host_ipc": true, - + "node_selector": { "feature.node.kubernetes.io/amd-gpu-mi300x": "true", "topology.kubernetes.io/zone": "us-west-2a", "workload-type": "ml-training" }, - + "tolerations": [ { "key": "gpu", @@ -41,13 +41,13 @@ "effect": "NoSchedule" } ], - + "results_pvc": "ml-results-pvc", "data_pvc": "ml-datasets-pvc", - + "output_dir": "./k8s_manifests/multi-node" }, - + "distributed": { "enabled": true, "backend": "nccl", @@ -56,7 +56,7 @@ "nproc_per_node": 2, "master_port": 29500 }, - + "env_vars": { "NCCL_DEBUG": "WARN", "NCCL_DEBUG_SUBSYS": "INIT,NET,GRAPH", @@ -73,7 +73,7 @@ "MIOPEN_USER_DB_PATH": "/tmp/.miopen", "RCCL_ENABLE_HIPGRAPH": "0" }, - + "_env_var_notes": { "NCCL_DEBUG": "Changed to WARN for cleaner logs (use INFO for debugging)", "MIOPEN_FIND_MODE": "1 = Use compiled kernels, avoid find-db warnings", @@ -82,6 +82,6 @@ "RCCL_ENABLE_HIPGRAPH": "Disable for compatibility", "NCCL_MIN_NCHANNELS": "Removed (warning says ignored for <8 GPUs)" }, - + "debug": false } diff --git a/examples/k8s-configs/basic/05-torchrun-nvidia-gpu-example.json b/examples/k8s-configs/basic/05-torchrun-nvidia-gpu-example.json index 7c087acc..14665bdb 100644 --- a/examples/k8s-configs/basic/05-torchrun-nvidia-gpu-example.json +++ b/examples/k8s-configs/basic/05-torchrun-nvidia-gpu-example.json @@ -2,29 +2,29 @@ "_comment": "NVIDIA GPU - Single Node, 4 GPUs", "_description": "Configuration for running models on 
NVIDIA GPUs (A100, H100, etc.) with distributed execution", "_use_case": "NVIDIA-based Kubernetes clusters, multi-GPU training", - + "gpu_vendor": "NVIDIA", "guest_os": "UBUNTU", - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "default", "gpu_count": 4, "gpu_resource_name": "nvidia.com/gpu", - + "memory": "128Gi", "memory_limit": "256Gi", "cpu": "48", "cpu_limit": "96", - + "image_pull_policy": "Always", "backoff_limit": 3, - + "node_selector": { "accelerator": "nvidia-tesla-a100" } }, - + "distributed": { "enabled": true, "backend": "nccl", @@ -33,7 +33,7 @@ "nproc_per_node": 4, "master_port": 29500 }, - + "env_vars": { "NCCL_DEBUG": "INFO", "NCCL_IB_DISABLE": "1", @@ -42,6 +42,6 @@ "NCCL_P2P_LEVEL": "NVL", "OMP_NUM_THREADS": "12" }, - + "debug": false } diff --git a/examples/k8s-configs/basic/06-data-provider-with-pvc.json b/examples/k8s-configs/basic/06-data-provider-with-pvc.json index 9bd2e47f..a00b3eb9 100644 --- a/examples/k8s-configs/basic/06-data-provider-with-pvc.json +++ b/examples/k8s-configs/basic/06-data-provider-with-pvc.json @@ -3,50 +3,50 @@ "_description": "Production-ready setup for training with external data (MinIO, S3, NAS, etc.)", "_use_case": "Models that require data provider (e.g., dummy_torchrun_data_minio)", "_auto_pvc": "✅ PVC is automatically created - NO manual kubectl commands needed!", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "default", "gpu_count": 2, - + "_comment_pvc": "OPTIONAL - Leave empty for auto-creation (recommended)", "_pvc_auto": "Auto-created: madengine-shared-data (100Gi, RWO/RWX based on nnodes)", "_pvc_custom": "To use existing PVC: uncomment and set: \"data_pvc\": \"your-pvc-name\"", - + "memory": "64Gi", "memory_limit": "128Gi", "cpu": "16", "cpu_limit": "32", - + "image_pull_policy": "Always", "backoff_limit": 3, "host_ipc": true }, - + "distributed": { "enabled": true, "backend": "nccl", "launcher": "torchrun", - + "_comment_single_node": 
"For single-node: nnodes=1, nproc_per_node=N_GPUs", "_comment_multi_node": "For multi-node: nnodes=N, nproc_per_node=GPUs_per_node", "nnodes": 1, "nproc_per_node": 2, - + "master_port": 29500 }, - + "env_vars": { "_comment_mad_datahome": "MAD_DATAHOME points to PVC mount point (default: /data)", "MAD_DATAHOME": "/data", - + "_comment_nccl": "NCCL/RCCL configuration for AMD GPUs", "NCCL_DEBUG": "WARN", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", - + "_comment_rocm": "ROCm optimizations", "HSA_FORCE_FINE_GRAIN_PCIE": "1", "HSA_ENABLE_SDMA": "0", @@ -54,13 +54,13 @@ "MIOPEN_USER_DB_PATH": "/tmp/.miopen", "RCCL_ENABLE_HIPGRAPH": "0" }, - + "_quick_start": { "step_1": "Build: madengine build --tags dummy_torchrun_data_minio --additional-context-file THIS_FILE", "step_2": "Run: madengine run --manifest-file build_manifest.json", "result": "✅ PVC auto-created, data downloaded, training started - all automatic!" }, - + "_how_it_works": { "auto_pvc": "madengine creates 'madengine-shared-data' PVC automatically if not found", "reusable": "PVC persists across runs - data downloads once, reuses forever", @@ -68,13 +68,12 @@ "verify": "kubectl get pvc madengine-shared-data", "inspect": "kubectl describe pvc madengine-shared-data" }, - + "_advanced": { "custom_pvc": "To use existing PVC: Add \"data_pvc\": \"your-pvc-name\" to k8s config above", "storage_class": "Auto-PVC uses cluster's default storage class", "pvc_size": "Default 100Gi - modify code in kubernetes.py if needed" }, - + "debug": false } - diff --git a/examples/k8s-configs/basic/megatron-lm-multi-node-basic.json b/examples/k8s-configs/basic/megatron-lm-multi-node-basic.json index e059ba08..30a4377c 100644 --- a/examples/k8s-configs/basic/megatron-lm-multi-node-basic.json +++ b/examples/k8s-configs/basic/megatron-lm-multi-node-basic.json @@ -3,10 +3,10 @@ "_description": "Large-scale transformer training with Megatron-LM on Kubernetes", "_use_case": "Multi-node Megatron-LM training with tensor and pipeline parallelism", 
"_reference": "https://github.com/NVIDIA/Megatron-LM", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 8, "namespace": "ml-training", @@ -16,19 +16,18 @@ "cpu_limit": "64", "image_pull_policy": "IfNotPresent" }, - + "distributed": { "launcher": "megatron", "nnodes": 4, "nproc_per_node": 8, "master_port": 29500 }, - + "env_vars": { "OMP_NUM_THREADS": "16", "NCCL_DEBUG": "INFO" }, - + "debug": false } - diff --git a/examples/k8s-configs/basic/sglang-disagg-custom-split.json b/examples/k8s-configs/basic/sglang-disagg-custom-split.json index 49aeecb1..41a26b55 100644 --- a/examples/k8s-configs/basic/sglang-disagg-custom-split.json +++ b/examples/k8s-configs/basic/sglang-disagg-custom-split.json @@ -10,10 +10,10 @@ "total": "7 pods total", "note": "Custom split overrides default 40/60 ratio" }, - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 8, "memory": "256Gi", @@ -24,7 +24,7 @@ "feature.node.kubernetes.io/amd-gpu-mi300x": "true" } }, - + "distributed": { "launcher": "sglang-disagg", "nnodes": 7, @@ -35,7 +35,7 @@ "decode_nodes": 2 } }, - + "context": { "env_vars": { "SGLANG_ENABLE_RADIX_CACHE": "1", @@ -45,4 +45,3 @@ } } } - diff --git a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json b/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json index c16fd342..b632d49b 100644 --- a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json +++ b/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json @@ -9,10 +9,10 @@ "decode": "Pods 3-4 (2 nodes, ~60%)", "total": "5 pods total" }, - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 8, "memory": "256Gi", @@ -23,14 +23,14 @@ "feature.node.kubernetes.io/amd-gpu-mi300x": "true" } }, - + "distributed": { "launcher": "sglang-disagg", "nnodes": 5, "nproc_per_node": 8, "master_port": 29500 }, - + "context": { "env_vars": { "SGLANG_ENABLE_RADIX_CACHE": "1", @@ -41,4 +41,3 @@ } } } - diff --git 
a/examples/k8s-configs/basic/sglang-multi-node-basic.json b/examples/k8s-configs/basic/sglang-multi-node-basic.json index b693260e..4c4fe1ce 100644 --- a/examples/k8s-configs/basic/sglang-multi-node-basic.json +++ b/examples/k8s-configs/basic/sglang-multi-node-basic.json @@ -3,10 +3,10 @@ "_description": "Multi-node SGLang with native launcher and Ray", "_use_case": "Distributed LLM inference serving", "_reference": "https://github.com/sgl-project/sglang", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 4, "memory": "256Gi", @@ -17,14 +17,14 @@ "feature.node.kubernetes.io/amd-gpu-mi300x": "true" } }, - + "distributed": { "launcher": "sglang", "nnodes": 2, "nproc_per_node": 4, "master_port": 29500 }, - + "context": { "env_vars": { "SGLANG_KV_CACHE_SIZE": "0.5", @@ -33,4 +33,3 @@ } } } - diff --git a/examples/k8s-configs/basic/torchtitan-multi-node-basic.json b/examples/k8s-configs/basic/torchtitan-multi-node-basic.json index e350605d..0cd41bfe 100644 --- a/examples/k8s-configs/basic/torchtitan-multi-node-basic.json +++ b/examples/k8s-configs/basic/torchtitan-multi-node-basic.json @@ -3,10 +3,10 @@ "_description": "Uses multi-dimensional parallelism (TP + PP + FSDP2)", "_use_case": "Large-scale LLM pre-training (70B+ models)", "_reference": "https://github.com/pytorch/torchtitan", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 8, "memory": "512Gi", @@ -17,14 +17,14 @@ "feature.node.kubernetes.io/amd-gpu-mi300x": "true" } }, - + "distributed": { "launcher": "torchtitan", "nnodes": 4, "nproc_per_node": 8, "master_port": 29500 }, - + "context": { "pre_scripts": [ "scripts/common/setup_pytorch_env.sh" @@ -36,4 +36,3 @@ } } } - diff --git a/examples/k8s-configs/basic/vllm-multi-node-basic.json b/examples/k8s-configs/basic/vllm-multi-node-basic.json index 4c1b61c9..67e191d1 100644 --- a/examples/k8s-configs/basic/vllm-multi-node-basic.json +++ b/examples/k8s-configs/basic/vllm-multi-node-basic.json @@ -3,10 +3,10 @@ 
"_description": "Each pod runs independent vLLM replica for higher throughput", "_use_case": "High-throughput LLM inference serving", "_reference": "https://github.com/vllm-project/vllm", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 4, "memory": "256Gi", @@ -17,14 +17,14 @@ "feature.node.kubernetes.io/amd-gpu-mi300x": "true" } }, - + "distributed": { "launcher": "vllm", "nnodes": 2, "nproc_per_node": 4, "master_port": 29500 }, - + "context": { "env_vars": { "VLLM_KV_CACHE_SIZE": "0.5", @@ -35,4 +35,3 @@ } } } - diff --git a/examples/k8s-configs/minimal/custom-namespace-minimal.json b/examples/k8s-configs/minimal/custom-namespace-minimal.json index fa3747dd..bac0ef06 100644 --- a/examples/k8s-configs/minimal/custom-namespace-minimal.json +++ b/examples/k8s-configs/minimal/custom-namespace-minimal.json @@ -2,14 +2,13 @@ "_comment": "Minimal Config with Custom Namespace", "_description": "Shows how to override specific defaults", "_use_case": "Deploying to a specific namespace", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 1, "namespace": "default", "memory": "32Gi" } } - diff --git a/examples/k8s-configs/minimal/deepspeed-minimal.json b/examples/k8s-configs/minimal/deepspeed-minimal.json index 7bece847..7fdcad8c 100644 --- a/examples/k8s-configs/minimal/deepspeed-minimal.json +++ b/examples/k8s-configs/minimal/deepspeed-minimal.json @@ -2,23 +2,23 @@ "_comment": "DeepSpeed Minimal Config - Uses bash script with torchrun", "_description": "DeepSpeed with ZeRO-1 optimization", "_use_case": "Test DeepSpeed distributed training with bash wrapper", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 2, "namespace": "default", "memory": "32Gi", "cpu": "16" }, - + "distributed": { "launcher": "deepspeed", "nnodes": 1, "nproc_per_node": 2 }, - + "env_vars": { "OMP_NUM_THREADS": "8" } diff --git a/examples/k8s-configs/minimal/megatron-lm-exclude-node.json 
b/examples/k8s-configs/minimal/megatron-lm-exclude-node.json index 793431a2..6571b445 100644 --- a/examples/k8s-configs/minimal/megatron-lm-exclude-node.json +++ b/examples/k8s-configs/minimal/megatron-lm-exclude-node.json @@ -3,40 +3,39 @@ "_description": "Use this if you need to explicitly exclude a node with disk pressure or other issues", "_use_case": "Temporary config to avoid problematic nodes during maintenance", "_note": "This uses anti-affinity to exclude banff-pla-r25-05. Update the hostname as needed.", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 2, "namespace": "default", - + "memory": "32Gi", "memory_limit": "128Gi", "cpu": "16", "cpu_limit": "32", - + "image_pull_policy": "IfNotPresent", - + "node_selector": { "feature.node.kubernetes.io/amd-gpu": "true" } }, - + "distributed": { "launcher": "megatron", "nnodes": 1, "nproc_per_node": 2 }, - + "env_vars": { "OMP_NUM_THREADS": "8" }, - + "_instructions": [ "To exclude a specific node, add node affinity in the deployment code,", "or temporarily drain the node: kubectl drain banff-pla-r25-05 --ignore-daemonsets", "This config ensures scheduling only on nodes with AMD GPUs" ] } - diff --git a/examples/k8s-configs/minimal/megatron-lm-minimal.json b/examples/k8s-configs/minimal/megatron-lm-minimal.json index 43266e01..86f3db49 100644 --- a/examples/k8s-configs/minimal/megatron-lm-minimal.json +++ b/examples/k8s-configs/minimal/megatron-lm-minimal.json @@ -2,23 +2,23 @@ "_comment": "Megatron-LM Minimal Config - Dedicated launcher support", "_description": "Megatron-LM with automated tensor/pipeline parallelism setup", "_use_case": "Large-scale transformer training with Megatron-LM on Kubernetes", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 2, "namespace": "default", "memory": "32Gi", "cpu": "16" }, - + "distributed": { "launcher": "megatron", "nnodes": 1, "nproc_per_node": 2 }, - + "env_vars": { "OMP_NUM_THREADS": "8" } diff --git 
a/examples/k8s-configs/minimal/megatron-lm-optimized.json b/examples/k8s-configs/minimal/megatron-lm-optimized.json index 29559308..d25eadec 100644 --- a/examples/k8s-configs/minimal/megatron-lm-optimized.json +++ b/examples/k8s-configs/minimal/megatron-lm-optimized.json @@ -2,30 +2,30 @@ "_comment": "Optimized Megatron-LM Configuration with Node Selector", "_description": "Production-ready configuration with resource management and node selection", "_use_case": "Megatron-LM training with automatic node selection to avoid problematic nodes", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 2, "namespace": "default", - + "memory": "32Gi", "memory_limit": "128Gi", "cpu": "16", "cpu_limit": "32", - + "image_pull_policy": "IfNotPresent", "backoff_limit": 3, - + "node_selector": { "feature.node.kubernetes.io/amd-gpu": "true", "amd.com/gpu.product-name": "AMD_Instinct_MI300X_OAM" }, - + "tolerations": [] }, - + "distributed": { "enabled": true, "backend": "nccl", @@ -34,7 +34,7 @@ "nproc_per_node": 2, "master_port": 29500 }, - + "env_vars": { "OMP_NUM_THREADS": "8", "NCCL_DEBUG": "WARN", @@ -47,7 +47,6 @@ "HSA_FORCE_FINE_GRAIN_PCIE": "1", "RCCL_ENABLE_HIPGRAPH": "0" }, - + "debug": false } - diff --git a/examples/k8s-configs/minimal/sglang-disagg-minimal.json b/examples/k8s-configs/minimal/sglang-disagg-minimal.json index f0f6ad05..c1683f1c 100644 --- a/examples/k8s-configs/minimal/sglang-disagg-minimal.json +++ b/examples/k8s-configs/minimal/sglang-disagg-minimal.json @@ -2,18 +2,17 @@ "_comment": "Minimal SGLang Disaggregated configuration - 3 nodes minimum", "_description": "SGLang disaggregated inference with 3 pods (1 proxy + 1 prefill + 1 decode)", "_architecture": "Pod 0: Proxy, Pod 1: Prefill, Pod 2: Decode", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 3 }, - + "distributed": { "launcher": "sglang-disagg", "nnodes": 3, "nproc_per_node": 1 } } - diff --git 
a/examples/k8s-configs/minimal/sglang-single-node-minimal.json b/examples/k8s-configs/minimal/sglang-single-node-minimal.json index 5a12b19d..b3f0e297 100644 --- a/examples/k8s-configs/minimal/sglang-single-node-minimal.json +++ b/examples/k8s-configs/minimal/sglang-single-node-minimal.json @@ -3,26 +3,25 @@ "_description": "SGLang inference with Tensor Parallelism for single-node", "_use_case": "LLM inference serving with SGLang", "_reference": "https://github.com/sgl-project/sglang", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 4, "memory": "128Gi", "cpu": "32" }, - + "distributed": { "launcher": "sglang", "nnodes": 1, "nproc_per_node": 4 }, - + "context": { "env_vars": { "SGLANG_KV_CACHE_SIZE": "0.7" } } } - diff --git a/examples/k8s-configs/minimal/torchrun-multi-gpu-minimal.json b/examples/k8s-configs/minimal/torchrun-multi-gpu-minimal.json index 49a2ebbf..f5a34635 100644 --- a/examples/k8s-configs/minimal/torchrun-multi-gpu-minimal.json +++ b/examples/k8s-configs/minimal/torchrun-multi-gpu-minimal.json @@ -2,18 +2,17 @@ "_comment": "Minimal Multi-GPU Config - 2 GPUs with torchrun", "_description": "Uses built-in defaults for AMD multi-GPU optimizations", "_use_case": "Quick multi-GPU training with minimal configuration", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 2 }, - + "distributed": { "launcher": "torchrun", "nnodes": 1, "nproc_per_node": 2 } } - diff --git a/examples/k8s-configs/minimal/torchrun-multi-node-minimal.json b/examples/k8s-configs/minimal/torchrun-multi-node-minimal.json index 656ac123..bf194838 100644 --- a/examples/k8s-configs/minimal/torchrun-multi-node-minimal.json +++ b/examples/k8s-configs/minimal/torchrun-multi-node-minimal.json @@ -2,18 +2,17 @@ "_comment": "Minimal Multi-Node Config - 2 nodes x 2 GPUs each", "_description": "Uses built-in defaults for multi-node distributed workload", "_use_case": "Quick multi-node testing with 4 GPUs total", - + "gpu_vendor": "AMD", 
"guest_os": "UBUNTU", - + "k8s": { "gpu_count": 2 }, - + "distributed": { "launcher": "torchrun", "nnodes": 2, "nproc_per_node": 2 } } - diff --git a/examples/k8s-configs/minimal/torchrun-nvidia-gpu-minimal.json b/examples/k8s-configs/minimal/torchrun-nvidia-gpu-minimal.json index 444e037f..2a6e39c5 100644 --- a/examples/k8s-configs/minimal/torchrun-nvidia-gpu-minimal.json +++ b/examples/k8s-configs/minimal/torchrun-nvidia-gpu-minimal.json @@ -2,18 +2,17 @@ "_comment": "Minimal NVIDIA GPU Config - 4 GPUs with torchrun", "_description": "Uses built-in NVIDIA optimizations and presets", "_use_case": "Quick NVIDIA GPU testing with minimal configuration", - + "gpu_vendor": "NVIDIA", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 4 }, - + "distributed": { "launcher": "torchrun", "nnodes": 1, "nproc_per_node": 4 } } - diff --git a/examples/k8s-configs/minimal/torchrun-single-gpu-minimal.json b/examples/k8s-configs/minimal/torchrun-single-gpu-minimal.json index 5041003e..3ed65bdb 100644 --- a/examples/k8s-configs/minimal/torchrun-single-gpu-minimal.json +++ b/examples/k8s-configs/minimal/torchrun-single-gpu-minimal.json @@ -2,18 +2,17 @@ "_comment": "Minimal Single GPU Config - Only Essential Fields", "_description": "Uses built-in defaults for everything except GPU count", "_use_case": "Quick single GPU testing with minimal configuration", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 1 }, - + "distributed": { "launcher": "torchrun", "nnodes": 1, "nproc_per_node": 1 } } - diff --git a/examples/k8s-configs/minimal/torchtitan-single-node-minimal.json b/examples/k8s-configs/minimal/torchtitan-single-node-minimal.json index 9605f09c..577ea998 100644 --- a/examples/k8s-configs/minimal/torchtitan-single-node-minimal.json +++ b/examples/k8s-configs/minimal/torchtitan-single-node-minimal.json @@ -3,20 +3,19 @@ "_description": "Uses torchtitan with Tensor Parallelism for single-node training", "_use_case": "Quick LLM pre-training with torchtitan (8B 
model)", "_reference": "https://github.com/pytorch/torchtitan", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 8, "memory": "256Gi", "cpu": "64" }, - + "distributed": { "launcher": "torchtitan", "nnodes": 1, "nproc_per_node": 8 } } - diff --git a/examples/k8s-configs/minimal/vllm-single-node-minimal.json b/examples/k8s-configs/minimal/vllm-single-node-minimal.json index ed0de4ac..102db25e 100644 --- a/examples/k8s-configs/minimal/vllm-single-node-minimal.json +++ b/examples/k8s-configs/minimal/vllm-single-node-minimal.json @@ -3,22 +3,22 @@ "_description": "vLLM inference with Tensor Parallelism for single-node", "_use_case": "LLM inference serving with vLLM", "_reference": "https://github.com/vllm-project/vllm", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "k8s": { "gpu_count": 4, "memory": "128Gi", "cpu": "32" }, - + "distributed": { "launcher": "vllm", "nnodes": 1, "nproc_per_node": 4 }, - + "context": { "env_vars": { "VLLM_KV_CACHE_SIZE": "0.7", @@ -26,4 +26,3 @@ } } } - diff --git a/examples/profiling-configs/rocprofv3_multi_gpu.json b/examples/profiling-configs/rocprofv3_multi_gpu.json index 2b2b250d..a830e78b 100644 --- a/examples/profiling-configs/rocprofv3_multi_gpu.json +++ b/examples/profiling-configs/rocprofv3_multi_gpu.json @@ -3,10 +3,10 @@ "_description": "Configuration for distributed training on 4 GPUs with ROCm profiling tools on SLURM", "_use_case": "Multi-GPU training with communication profiling, power monitoring, and VRAM tracking", "_note": "Using 'amd-rccl' partition. 
Change to your cluster's partition name if different (e.g., 'gpu', 'compute').", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, @@ -15,12 +15,12 @@ "output_dir": "./slurm_results", "exclusive": false }, - + "distributed": { "launcher": "torchrun", "nproc_per_node": 4 }, - + "tools": [ { "name": "rocprofv3_communication", @@ -43,10 +43,10 @@ } } ], - + "env_vars": { "OMP_NUM_THREADS": "8" }, - + "debug": false } diff --git a/examples/profiling-configs/rocprofv3_multi_node.json b/examples/profiling-configs/rocprofv3_multi_node.json index ad87c814..e648bef2 100644 --- a/examples/profiling-configs/rocprofv3_multi_node.json +++ b/examples/profiling-configs/rocprofv3_multi_node.json @@ -3,10 +3,10 @@ "_description": "Configuration for distributed training across multiple nodes with ROCm profiling tools on SLURM", "_use_case": "Large-scale multi-node training with communication profiling, power monitoring, and VRAM tracking", "_note": "Using 'amd-rccl' partition. 
Change to your cluster's partition name if different (e.g., 'gpu', 'compute').", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 2, @@ -15,13 +15,13 @@ "output_dir": "./slurm_results", "exclusive": true }, - + "distributed": { "launcher": "torchrun", "nnodes": 2, "nproc_per_node": 4 }, - + "tools": [ { "name": "rocprofv3_communication", @@ -45,12 +45,12 @@ } } ], - + "env_vars": { "OMP_NUM_THREADS": "8", "NCCL_IB_DISABLE": "0", "NCCL_SOCKET_IFNAME": "eth0" }, - + "debug": false } diff --git a/examples/slurm-configs/basic/01-single-node-single-gpu.json b/examples/slurm-configs/basic/01-single-node-single-gpu.json index c0877717..5e2f073b 100644 --- a/examples/slurm-configs/basic/01-single-node-single-gpu.json +++ b/examples/slurm-configs/basic/01-single-node-single-gpu.json @@ -3,10 +3,10 @@ "_description": "Configuration for running a model on a single GPU on a SLURM cluster", "_use_case": "Testing, small models, quick benchmarks (single GPU, no distributed execution)", "_note": "Using 'amd-rccl' partition. Change to your cluster's partition name if different (e.g., 'gpu', 'compute').", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, @@ -15,11 +15,10 @@ "output_dir": "./slurm_results", "exclusive": false }, - + "env_vars": { "OMP_NUM_THREADS": "8" }, - + "debug": false } - diff --git a/examples/slurm-configs/basic/02-single-node-multi-gpu.json b/examples/slurm-configs/basic/02-single-node-multi-gpu.json index a0e5b6ae..0763127c 100644 --- a/examples/slurm-configs/basic/02-single-node-multi-gpu.json +++ b/examples/slurm-configs/basic/02-single-node-multi-gpu.json @@ -3,10 +3,10 @@ "_description": "Configuration for running a model on 8 GPUs on a single SLURM node", "_use_case": "Single-node distributed workload, large models requiring multiple GPUs", "_note": "Using 'amd-rccl' partition. 
Change to your cluster's partition name if different.", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, @@ -15,19 +15,18 @@ "output_dir": "./slurm_results", "exclusive": true }, - + "distributed": { "launcher": "torchrun", "backend": "nccl", "nnodes": 1, "nproc_per_node": 8 }, - + "env_vars": { "OMP_NUM_THREADS": "8", "NCCL_DEBUG": "WARN" }, - + "debug": false } - diff --git a/examples/slurm-configs/basic/03-multi-node-basic.json b/examples/slurm-configs/basic/03-multi-node-basic.json index 006890a7..e6f96584 100644 --- a/examples/slurm-configs/basic/03-multi-node-basic.json +++ b/examples/slurm-configs/basic/03-multi-node-basic.json @@ -3,10 +3,10 @@ "_description": "Configuration for distributed workload across 2 nodes with 8 GPUs per node (16 GPUs total)", "_use_case": "Multi-node distributed execution for large models (training or inference)", "_note": "Target is auto-detected as 'slurm' from presence of 'slurm' config section", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 2, @@ -16,7 +16,7 @@ "exclusive": true, "network_interface": "eth0" }, - + "distributed": { "launcher": "torchrun", "backend": "nccl", @@ -24,7 +24,7 @@ "nnodes": 2, "nproc_per_node": 8 }, - + "env_vars": { "NCCL_DEBUG": "WARN", "NCCL_DEBUG_SUBSYS": "INIT,NET", @@ -41,7 +41,6 @@ "HSA_FORCE_FINE_GRAIN_PCIE": "1", "RCCL_ENABLE_HIPGRAPH": "0" }, - + "debug": false } - diff --git a/examples/slurm-configs/basic/04-multi-node-advanced.json b/examples/slurm-configs/basic/04-multi-node-advanced.json index 1708f078..f25bb934 100644 --- a/examples/slurm-configs/basic/04-multi-node-advanced.json +++ b/examples/slurm-configs/basic/04-multi-node-advanced.json @@ -3,10 +3,10 @@ "_description": "Configuration for large-scale distributed workloads with advanced options", "_use_case": "Production-scale multi-node training with custom workspace and results collection", "_note": "Using 'amd-rccl' partition. 
Adjust for your cluster if needed.", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 4, @@ -25,7 +25,7 @@ "openmpi/4.1.4" ] }, - + "distributed": { "launcher": "torchrun", "backend": "nccl", @@ -33,7 +33,7 @@ "nnodes": 4, "nproc_per_node": 8 }, - + "env_vars": { "NCCL_DEBUG": "INFO", "NCCL_DEBUG_SUBSYS": "INIT,NET", @@ -53,9 +53,9 @@ "NCCL_BUFFSIZE": "8388608", "NCCL_P2P_LEVEL": "NVL" }, - + "shared_data": "/shared/datasets", - + "_notes": { "description": "Advanced configuration with InfiniBand, shared storage, and custom SLURM settings", "modules": "Load required environment modules before job execution", @@ -65,7 +65,6 @@ "shared_workspace": "Shared filesystem for job execution (NFS/Lustre)", "shared_data": "Shared dataset location accessible from all nodes" }, - + "debug": false } - diff --git a/examples/slurm-configs/basic/05-vllm-single-node.json b/examples/slurm-configs/basic/05-vllm-single-node.json index 7d77c4df..01d15eb8 100644 --- a/examples/slurm-configs/basic/05-vllm-single-node.json +++ b/examples/slurm-configs/basic/05-vllm-single-node.json @@ -3,10 +3,10 @@ "_description": "vLLM inference with tensor parallelism on single node", "_use_case": "High-throughput LLM inference on single node with multiple GPUs", "_note": "vLLM uses tensor parallelism to split model across GPUs", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, @@ -15,15 +15,15 @@ "output_dir": "./slurm_results", "exclusive": true }, - + "distributed": { "launcher": "vllm", "nnodes": 1, "nproc_per_node": 4 }, - + "pre_scripts": [], - + "env_vars": { "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", "VLLM_USE_MODELSCOPE": "False", @@ -34,4 +34,3 @@ "NCCL_DEBUG": "WARN" } } - diff --git a/examples/slurm-configs/basic/06-vllm-multi-node.json b/examples/slurm-configs/basic/06-vllm-multi-node.json index d51262db..3763b0ad 100644 --- a/examples/slurm-configs/basic/06-vllm-multi-node.json +++ 
b/examples/slurm-configs/basic/06-vllm-multi-node.json @@ -10,10 +10,10 @@ "Better throughput - parallel processing", "Ideal for benchmarking and production serving" ], - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 2, @@ -21,13 +21,13 @@ "time": "00:45:00", "output_dir": "./slurm_results", "exclusive": true, - + "_comment_node_check": "Preflight GPU health check (helps avoid OOM from stale processes)", "enable_node_check": true, "auto_cleanup_nodes": false, "verbose_node_check": false }, - + "distributed": { "launcher": "vllm", "nnodes": 2, @@ -36,25 +36,25 @@ "port": 29500, "_note": "Data Parallelism: Each node runs independently, no cross-node communication needed" }, - + "pre_scripts": [], - + "env_vars": { "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", "VLLM_USE_MODELSCOPE": "False", "VLLM_WORKER_MULTIPROC_METHOD": "spawn", - + "_comment_memory": "Higher GPU utilization for Data Parallelism (no PP overhead)", "VLLM_KV_CACHE_SIZE": "0.8", "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", "HSA_FORCE_FINE_GRAIN_PCIE": "1", - + "_comment_timeouts": "Reduced timeouts for faster failure detection in DP mode", "NCCL_TIMEOUT": "300", "VLLM_ENGINE_ITERATION_TIMEOUT_S": "120", "RAY_health_check_timeout_ms": "30000", "RAY_gcs_rpc_server_reconnect_timeout_s": "60", - + "_comment_nccl": "NCCL settings for within-node tensor parallelism", "NCCL_DEBUG": "WARN", "NCCL_DEBUG_SUBSYS": "INIT,NET", @@ -63,4 +63,3 @@ "TORCH_NCCL_HIGH_PRIORITY": "1" } } - diff --git a/examples/slurm-configs/basic/07-sglang-single-node.json b/examples/slurm-configs/basic/07-sglang-single-node.json index 8aaae928..1acef300 100644 --- a/examples/slurm-configs/basic/07-sglang-single-node.json +++ b/examples/slurm-configs/basic/07-sglang-single-node.json @@ -3,10 +3,10 @@ "_description": "SGLang inference with tensor parallelism on single node", "_use_case": "High-throughput LLM inference on single node with multiple GPUs", "_note": "SGLang uses tensor 
parallelism to split model across GPUs", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, @@ -15,13 +15,13 @@ "output_dir": "./slurm_results", "exclusive": true }, - + "distributed": { "launcher": "sglang", "nnodes": 1, "nproc_per_node": 4 }, - + "env_vars": { "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", "SGLANG_USE_MODELSCOPE": "False", @@ -38,4 +38,3 @@ "RAY_BACKEND_LOG_LEVEL": "warning" } } - diff --git a/examples/slurm-configs/basic/08-sglang-multi-node.json b/examples/slurm-configs/basic/08-sglang-multi-node.json index 8485b93c..8db86870 100644 --- a/examples/slurm-configs/basic/08-sglang-multi-node.json +++ b/examples/slurm-configs/basic/08-sglang-multi-node.json @@ -3,10 +3,10 @@ "_description": "SGLang inference with tensor + data parallelism across nodes", "_use_case": "High-throughput LLM inference requiring multiple nodes", "_note": "SGLang uses tensor parallelism within nodes and data parallelism across nodes", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 2, @@ -15,7 +15,7 @@ "output_dir": "./slurm_results", "exclusive": true }, - + "distributed": { "launcher": "sglang", "nnodes": 2, @@ -23,7 +23,7 @@ "backend": "nccl", "port": 29500 }, - + "env_vars": { "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", "SGLANG_USE_MODELSCOPE": "False", @@ -44,4 +44,3 @@ "RAY_BACKEND_LOG_LEVEL": "warning" } } - diff --git a/examples/slurm-configs/basic/09-megatron-lm-multi-node.json b/examples/slurm-configs/basic/09-megatron-lm-multi-node.json index 84e3c3f6..b072bb71 100644 --- a/examples/slurm-configs/basic/09-megatron-lm-multi-node.json +++ b/examples/slurm-configs/basic/09-megatron-lm-multi-node.json @@ -3,10 +3,10 @@ "_description": "Large-scale transformer training with Megatron-LM on SLURM", "_use_case": "Multi-node Megatron-LM training with tensor and pipeline parallelism", "_reference": "https://github.com/NVIDIA/Megatron-LM", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + 
"slurm": { "partition": "gpu", "account": "research", @@ -15,20 +15,19 @@ "time": "24:00:00", "mem": "256G" }, - + "distributed": { "launcher": "megatron", "nnodes": 4, "nproc_per_node": 8, "master_port": 29500 }, - + "env_vars": { "OMP_NUM_THREADS": "16", "NCCL_DEBUG": "INFO", "NCCL_IB_DISABLE": "0" }, - + "debug": false } - diff --git a/examples/slurm-configs/basic/cluster-amd-rccl.json b/examples/slurm-configs/basic/cluster-amd-rccl.json index e70f8721..5cf0c623 100644 --- a/examples/slurm-configs/basic/cluster-amd-rccl.json +++ b/examples/slurm-configs/basic/cluster-amd-rccl.json @@ -8,10 +8,10 @@ "default_qos": "normal", "discovery_command": "sinfo -o '%P %.5a %.10l %.6D %.6t %N %G'" }, - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "account": "amd-rccl", @@ -22,12 +22,12 @@ "output_dir": "./slurm_results", "exclusive": true }, - + "distributed": { "backend": "nccl", "port": 29500 }, - + "env_vars": { "NCCL_DEBUG": "WARN", "NCCL_SOCKET_IFNAME": "eth0", @@ -35,7 +35,6 @@ "MIOPEN_FIND_MODE": "1", "MIOPEN_USER_DB_PATH": "/tmp/.miopen" }, - + "debug": false } - diff --git a/examples/slurm-configs/basic/sglang-disagg-custom-split.json b/examples/slurm-configs/basic/sglang-disagg-custom-split.json index f38bcf64..83be85ef 100644 --- a/examples/slurm-configs/basic/sglang-disagg-custom-split.json +++ b/examples/slurm-configs/basic/sglang-disagg-custom-split.json @@ -11,10 +11,10 @@ "custom_override": "4 prefill + 2 decode (4/2 split)" }, "_note": "Custom split allows optimization for prompt-heavy workloads", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 7, @@ -23,7 +23,7 @@ "output_dir": "./slurm_results", "exclusive": true }, - + "distributed": { "launcher": "sglang-disagg", "nnodes": 7, @@ -35,7 +35,7 @@ "decode_nodes": 2 } }, - + "env_vars": { "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", "SGLANG_USE_MODELSCOPE": "False", @@ -56,4 +56,3 @@ "RAY_BACKEND_LOG_LEVEL": "warning" } } - 
diff --git a/examples/slurm-configs/basic/sglang-disagg-multi-node.json b/examples/slurm-configs/basic/sglang-disagg-multi-node.json index 7dfbae19..4b24314a 100644 --- a/examples/slurm-configs/basic/sglang-disagg-multi-node.json +++ b/examples/slurm-configs/basic/sglang-disagg-multi-node.json @@ -10,10 +10,10 @@ "tensor_parallel": "8 GPUs per node" }, "_note": "SGLang Disaggregated separates prefill and decode into specialized clusters connected via Mooncake", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 5, @@ -22,7 +22,7 @@ "output_dir": "./slurm_results", "exclusive": true }, - + "distributed": { "launcher": "sglang-disagg", "nnodes": 5, @@ -30,7 +30,7 @@ "backend": "nccl", "port": 29500 }, - + "env_vars": { "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", "SGLANG_USE_MODELSCOPE": "False", @@ -53,4 +53,3 @@ "MOONCAKE_TEST_MODE": "0" } } - diff --git a/examples/slurm-configs/minimal/deepspeed-minimal.json b/examples/slurm-configs/minimal/deepspeed-minimal.json index ae105389..a92287a3 100644 --- a/examples/slurm-configs/minimal/deepspeed-minimal.json +++ b/examples/slurm-configs/minimal/deepspeed-minimal.json @@ -2,23 +2,23 @@ "_comment": "DeepSpeed Config - Uses deepspeed launcher", "_description": "DeepSpeed with ZeRO-1 optimization", "_use_case": "Test DeepSpeed distributed training on SLURM (training-specific launcher)", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, "gpus_per_node": 4, "time": "02:00:00" }, - + "distributed": { "launcher": "deepspeed", "nnodes": 1, "nproc_per_node": 4 }, - + "env_vars": { "DEEPSPEED_LAUNCHER": "deepspeed" } diff --git a/examples/slurm-configs/minimal/megatron-lm-minimal.json b/examples/slurm-configs/minimal/megatron-lm-minimal.json index 9480359e..828daee9 100644 --- a/examples/slurm-configs/minimal/megatron-lm-minimal.json +++ b/examples/slurm-configs/minimal/megatron-lm-minimal.json @@ -2,23 +2,23 @@ "_comment": "Megatron-LM 
Minimal Config - Dedicated launcher support", "_description": "Megatron-LM with automated tensor/pipeline parallelism setup", "_use_case": "Large-scale transformer training with Megatron-LM on SLURM", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, "gpus_per_node": 2, "time": "02:00:00" }, - + "distributed": { "launcher": "megatron", "nnodes": 1, "nproc_per_node": 2 }, - + "env_vars": { "OMP_NUM_THREADS": "8" } diff --git a/examples/slurm-configs/minimal/sglang-disagg-minimal.json b/examples/slurm-configs/minimal/sglang-disagg-minimal.json index ee4ad9f2..03bce46c 100644 --- a/examples/slurm-configs/minimal/sglang-disagg-minimal.json +++ b/examples/slurm-configs/minimal/sglang-disagg-minimal.json @@ -2,21 +2,20 @@ "_comment": "Minimal SGLang Disaggregated configuration - 3 nodes minimum", "_description": "SGLang disaggregated inference with 3 nodes (1 proxy + 1 prefill + 1 decode)", "_architecture": "Node 0: Proxy, Node 1: Prefill, Node 2: Decode", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "gpu", "nodes": 3, "gpus_per_node": 1, "time": "02:00:00" }, - + "distributed": { "launcher": "sglang-disagg", "nnodes": 3, "nproc_per_node": 1 } } - diff --git a/examples/slurm-configs/minimal/sglang-multi-node-minimal.json b/examples/slurm-configs/minimal/sglang-multi-node-minimal.json index 057b5004..7c318d19 100644 --- a/examples/slurm-configs/minimal/sglang-multi-node-minimal.json +++ b/examples/slurm-configs/minimal/sglang-multi-node-minimal.json @@ -1,21 +1,20 @@ { "_comment": "Minimal SGLang multi-node configuration", "_description": "SGLang inference with 2 nodes, 4 GPUs per node", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 2, "gpus_per_node": 4, "time": "04:00:00" }, - + "distributed": { "launcher": "sglang", "nnodes": 2, "nproc_per_node": 4 } } - diff --git a/examples/slurm-configs/minimal/sglang-single-node-minimal.json 
b/examples/slurm-configs/minimal/sglang-single-node-minimal.json index 7e2eae97..b0b306dc 100644 --- a/examples/slurm-configs/minimal/sglang-single-node-minimal.json +++ b/examples/slurm-configs/minimal/sglang-single-node-minimal.json @@ -1,21 +1,20 @@ { "_comment": "Minimal SGLang single-node configuration", "_description": "SGLang inference with 4 GPUs tensor parallelism", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, "gpus_per_node": 4, "time": "02:00:00" }, - + "distributed": { "launcher": "sglang", "nnodes": 1, "nproc_per_node": 4 } } - diff --git a/examples/slurm-configs/minimal/torchrun-multi-gpu-minimal.json b/examples/slurm-configs/minimal/torchrun-multi-gpu-minimal.json index c8479d58..721b47e6 100644 --- a/examples/slurm-configs/minimal/torchrun-multi-gpu-minimal.json +++ b/examples/slurm-configs/minimal/torchrun-multi-gpu-minimal.json @@ -1,7 +1,7 @@ { "_comment": "Minimal multi-GPU SLURM configuration (8 GPUs, single node)", "_note": "Using 'amd-rccl' partition (default for this cluster)", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", @@ -14,4 +14,3 @@ "launcher": "torchrun" } } - diff --git a/examples/slurm-configs/minimal/torchrun-multi-node-minimal.json b/examples/slurm-configs/minimal/torchrun-multi-node-minimal.json index e00262bf..379a0012 100644 --- a/examples/slurm-configs/minimal/torchrun-multi-node-minimal.json +++ b/examples/slurm-configs/minimal/torchrun-multi-node-minimal.json @@ -4,7 +4,7 @@ "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 2, @@ -15,4 +15,3 @@ "launcher": "torchrun" } } - diff --git a/examples/slurm-configs/minimal/torchrun-single-gpu-minimal.json b/examples/slurm-configs/minimal/torchrun-single-gpu-minimal.json index 4151f94a..fafc7d86 100644 --- a/examples/slurm-configs/minimal/torchrun-single-gpu-minimal.json +++ b/examples/slurm-configs/minimal/torchrun-single-gpu-minimal.json @@ -4,11 +4,10 @@ "gpu_vendor": "AMD", 
"guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "gpus_per_node": 1, "time": "01:00:00" } } - diff --git a/examples/slurm-configs/minimal/torchtitan-multi-node-minimal.json b/examples/slurm-configs/minimal/torchtitan-multi-node-minimal.json index 0b227a99..9a3dba5f 100644 --- a/examples/slurm-configs/minimal/torchtitan-multi-node-minimal.json +++ b/examples/slurm-configs/minimal/torchtitan-multi-node-minimal.json @@ -5,7 +5,7 @@ "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 4, @@ -14,11 +14,10 @@ "mem": "512G", "constraint": "MI300X" }, - + "distributed": { "launcher": "torchtitan", "nnodes": 4, "nproc_per_node": 8 } } - diff --git a/examples/slurm-configs/minimal/torchtitan-single-node-minimal.json b/examples/slurm-configs/minimal/torchtitan-single-node-minimal.json index 4b7f532a..4ed9c837 100644 --- a/examples/slurm-configs/minimal/torchtitan-single-node-minimal.json +++ b/examples/slurm-configs/minimal/torchtitan-single-node-minimal.json @@ -5,7 +5,7 @@ "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, @@ -13,9 +13,8 @@ "time": "24:00:00", "mem": "256G" }, - + "distributed": { "launcher": "torchtitan" } } - diff --git a/examples/slurm-configs/minimal/vllm-multi-node-minimal.json b/examples/slurm-configs/minimal/vllm-multi-node-minimal.json index 0a77b5ea..ff970f6f 100644 --- a/examples/slurm-configs/minimal/vllm-multi-node-minimal.json +++ b/examples/slurm-configs/minimal/vllm-multi-node-minimal.json @@ -1,10 +1,10 @@ { "_comment": "Minimal vLLM multi-node configuration", "_description": "vLLM inference with 2 nodes, 4 GPUs per node", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 2, @@ -13,13 +13,13 @@ "enable_node_check": true, "auto_cleanup_nodes": false }, - + "distributed": { "launcher": "vllm", "nnodes": 2, "nproc_per_node": 4 }, - + "env_vars": { "VLLM_KV_CACHE_SIZE": "0.5", "PYTORCH_CUDA_ALLOC_CONF": 
"expandable_segments:True", @@ -27,7 +27,6 @@ "VLLM_ENGINE_ITERATION_TIMEOUT_S": "180", "RAY_health_check_timeout_ms": "60000" }, - + "pre_scripts": [] } - diff --git a/examples/slurm-configs/minimal/vllm-single-node-minimal.json b/examples/slurm-configs/minimal/vllm-single-node-minimal.json index 14c9b843..2072ac36 100644 --- a/examples/slurm-configs/minimal/vllm-single-node-minimal.json +++ b/examples/slurm-configs/minimal/vllm-single-node-minimal.json @@ -4,25 +4,24 @@ "gpu_vendor": "AMD", "guest_os": "UBUNTU", - + "slurm": { "partition": "amd-rccl", "nodes": 1, "gpus_per_node": 4, "time": "02:00:00" }, - + "distributed": { "launcher": "vllm", "nnodes": 1, "nproc_per_node": 4 }, - + "env_vars": { "VLLM_KV_CACHE_SIZE": "0.7", "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True" }, - + "pre_scripts": [] } - diff --git a/pyproject.toml b/pyproject.toml index 0c83f30a..97be986d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,8 @@ dependencies = [ "click>=8.0.0", "jinja2>=3.0.0", "pyyaml>=6.0", + "hydra-core>=1.3", + "omegaconf>=2.3", ] classifiers = [ "Programming Language :: Python :: 3", @@ -80,6 +82,7 @@ all = [ [tool.hatch.build.targets.wheel.force-include] "src/madengine/scripts" = "madengine/scripts" "src/madengine/deployment/templates" = "madengine/deployment/templates" +"src/madengine/configs" = "madengine/configs" [tool.hatch.version] source = "versioningit" diff --git a/setup.py b/setup.py index dab8c8c4..7a6bac74 100644 --- a/setup.py +++ b/setup.py @@ -27,21 +27,23 @@ print("Install it using: pip install setuptools") sys.exit(1) + def read_readme(readme_file="README.md"): """Read README.md file for long description.""" readme_path = Path(__file__).parent / readme_file if readme_path.exists(): with open(readme_path, "r", encoding="utf-8") as f: return f.read() - + # Fallback to README.md if specified file doesn't exist fallback_path = Path(__file__).parent / "README.md" if fallback_path.exists() and readme_file != "README.md": with 
open(fallback_path, "r", encoding="utf-8") as f: return f.read() - + return "" + def get_config_from_pyproject(): """Read configuration from pyproject.toml.""" try: @@ -52,30 +54,32 @@ def get_config_from_pyproject(): except ImportError: try: import toml as tomllib_alt + def load(f): - if hasattr(f, 'read'): + if hasattr(f, "read"): content = f.read() if isinstance(content, bytes): - content = content.decode('utf-8') + content = content.decode("utf-8") return tomllib_alt.loads(content) else: return tomllib_alt.load(f) + tomllib.load = load except ImportError: print("Warning: No TOML library found. Using fallback configuration.") return get_fallback_config() - + pyproject_path = Path(__file__).parent / "pyproject.toml" if not pyproject_path.exists(): print("Warning: pyproject.toml not found. Using fallback configuration.") return get_fallback_config() - + try: with open(pyproject_path, "rb") as f: data = tomllib.load(f) - + project = data.get("project", {}) - + # Extract configuration config = { "name": project.get("name", "madengine"), @@ -89,13 +93,14 @@ def load(f): "scripts": project.get("scripts", {}), "readme": project.get("readme", "README.md"), } - + return config - + except Exception as e: print(f"Warning: Could not read pyproject.toml: {e}") return get_fallback_config() + def get_fallback_config(): """Fallback configuration if pyproject.toml cannot be read.""" return { @@ -103,14 +108,26 @@ def get_fallback_config(): "description": "MAD Engine is a set of interfaces to run various AI models from public MAD.", "authors": [{"name": "Advanced Micro Devices", "email": "mad.support@amd.com"}], "dependencies": [ - "pandas", "GitPython", "jsondiff", "sqlalchemy", "setuptools-rust", - "paramiko", "tqdm", "pytest", - "typing-extensions", "pymongo", "toml", + "pandas", + "GitPython", + "jsondiff", + "sqlalchemy", + "setuptools-rust", + "paramiko", + "tqdm", + "pytest", + "typing-extensions", + "pymongo", + "toml", ], "optional_dependencies": { "dev": [ - "pytest", 
"pytest-cov", "pytest-xdist", "pytest-timeout", - "pytest-mock", "pytest-asyncio", + "pytest", + "pytest-cov", + "pytest-xdist", + "pytest-timeout", + "pytest-mock", + "pytest-asyncio", ] }, "requires_python": ">=3.8", @@ -123,52 +140,58 @@ def get_fallback_config(): "Homepage": "https://github.com/ROCm/madengine", "Issues": "https://github.com/ROCm/madengine/issues", }, - "scripts": { - "madengine": "madengine.cli.app:cli_main" - }, + "scripts": {"madengine": "madengine.cli.app:cli_main"}, } + def get_version(): """Get version from git tags or fallback to a default.""" try: import subprocess import re - + # Try to get version from git describe first (more accurate) try: result = subprocess.run( ["git", "describe", "--tags", "--dirty", "--always", "--long"], - capture_output=True, text=True, timeout=10, cwd=Path(__file__).parent + capture_output=True, + text=True, + timeout=10, + cwd=Path(__file__).parent, ) if result.returncode == 0: version_str = result.stdout.strip() - + # Handle case where there are no tags yet - if not version_str or len(version_str.split('-')) < 3: + if not version_str or len(version_str.split("-")) < 3: # Try to get just the commit hash result = subprocess.run( ["git", "rev-parse", "--short", "HEAD"], - capture_output=True, text=True, timeout=10, cwd=Path(__file__).parent + capture_output=True, + text=True, + timeout=10, + cwd=Path(__file__).parent, ) if result.returncode == 0: commit = result.stdout.strip() # Check if dirty dirty_result = subprocess.run( ["git", "diff-index", "--quiet", "HEAD", "--"], - capture_output=True, cwd=Path(__file__).parent + capture_output=True, + cwd=Path(__file__).parent, ) is_dirty = dirty_result.returncode != 0 if is_dirty: return f"2.0.0.dev0+g{commit}.dirty" else: return f"2.0.0.dev0+g{commit}" - + # Clean up the version string to be PEP 440 compliant - if version_str.startswith('v'): + if version_str.startswith("v"): version_str = version_str[1:] - + # Handle patterns like "1.0.0-5-g1234567" or 
"1.0.0-5-g1234567-dirty" - match = re.match(r'^([^-]+)-(\d+)-g([a-f0-9]+)(-dirty)?$', version_str) + match = re.match(r"^([^-]+)-(\d+)-g([a-f0-9]+)(-dirty)?$", version_str) if match: base_version, distance, commit, dirty = match.groups() if distance == "0": @@ -183,40 +206,44 @@ def get_version(): if dirty: version_str += ".dirty" return version_str - + # Handle case where we just have a commit hash (no tags) - if re.match(r'^[a-f0-9]+(-dirty)?$', version_str): - clean_hash = version_str.replace('-dirty', '') - if '-dirty' in version_str: + if re.match(r"^[a-f0-9]+(-dirty)?$", version_str): + clean_hash = version_str.replace("-dirty", "") + if "-dirty" in version_str: return f"2.0.0.dev0+g{clean_hash}.dirty" else: return f"2.0.0.dev0+g{clean_hash}" - + return version_str - + except (subprocess.SubprocessError, FileNotFoundError): pass - + # Fallback to short commit hash result = subprocess.run( ["git", "rev-parse", "--short", "HEAD"], - capture_output=True, text=True, timeout=10, cwd=Path(__file__).parent + capture_output=True, + text=True, + timeout=10, + cwd=Path(__file__).parent, ) if result.returncode == 0: commit = result.stdout.strip() return f"2.0.0.dev0+g{commit}" - + except Exception: pass - + # Final fallback return "2.0.0.dev0" + def main(): """Main setup function.""" try: config = get_config_from_pyproject() - + # Extract author information authors = config.get("authors", []) if authors: @@ -225,42 +252,45 @@ def main(): else: author_name = "Advanced Micro Devices" author_email = "mad.support@amd.com" - + # Extract scripts/entry points scripts = config.get("scripts", {}) entry_points = {"console_scripts": []} for script_name, module_path in scripts.items(): entry_points["console_scripts"].append(f"{script_name}={module_path}") - + # Find all packages packages = find_packages(where="src") if not packages: print("Warning: No packages found in src/ directory") # Fallback: look for madengine package specifically import os + src_path = Path(__file__).parent 
/ "src" if (src_path / "madengine").exists(): packages = ["madengine"] + [ f"madengine.{name}" for name in find_packages(where="src/madengine") ] - + # Setup package data to include scripts package_data = {"madengine": ["scripts/**/*"]} - + # Check if scripts directory exists and add patterns accordingly scripts_path = Path(__file__).parent / "src" / "madengine" / "scripts" if scripts_path.exists(): # Add more specific patterns to ensure all script files are included - package_data["madengine"].extend([ - "scripts/*", - "scripts/*/*", - "scripts/*/*/*", - "scripts/*/*/*/*", - ]) - + package_data["madengine"].extend( + [ + "scripts/*", + "scripts/*/*", + "scripts/*/*/*", + "scripts/*/*/*/*", + ] + ) + # Get version version = get_version() - + # Setup configuration setup_kwargs = { "name": config["name"], @@ -284,24 +314,28 @@ def main(): "zip_safe": False, "platforms": ["any"], } - + # Remove None values to avoid setuptools warnings setup_kwargs = {k: v for k, v in setup_kwargs.items() if v is not None} - + # Print some info for debugging - if len(sys.argv) > 1 and any(arg in sys.argv for arg in ["--version", "--help", "--help-commands"]): + if len(sys.argv) > 1 and any( + arg in sys.argv for arg in ["--version", "--help", "--help-commands"] + ): print(f"madengine version: {version}") print(f"Found {len(packages)} packages") if entry_points and entry_points["console_scripts"]: print(f"Console scripts: {', '.join(entry_points['console_scripts'])}") - + setup(**setup_kwargs) - + except Exception as e: print(f"Error during setup: {e}") import traceback + traceback.print_exc() sys.exit(1) + if __name__ == "__main__": main() diff --git a/src/madengine/__init__.py b/src/madengine/__init__.py index f121d08e..91a9ea0d 100644 --- a/src/madengine/__init__.py +++ b/src/madengine/__init__.py @@ -11,7 +11,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" -from importlib.metadata import version, PackageNotFoundError +from importlib.metadata import PackageNotFoundError, version try: __version__ = version("madengine") diff --git a/src/madengine/cli/__init__.py b/src/madengine/cli/__init__.py index 2ac185c2..5c83baa1 100644 --- a/src/madengine/cli/__init__.py +++ b/src/madengine/cli/__init__.py @@ -10,26 +10,28 @@ # Import for backward compatibility from .app import app, cli_main -from .constants import ExitCode, VALID_GPU_VENDORS, VALID_GUEST_OS from .constants import ( + DEFAULT_DATA_CONFIG, DEFAULT_MANIFEST_FILE, DEFAULT_PERF_OUTPUT, - DEFAULT_DATA_CONFIG, - DEFAULT_TOOLS_CONFIG, DEFAULT_TIMEOUT, + DEFAULT_TOOLS_CONFIG, + VALID_GPU_VENDORS, + VALID_GUEST_OS, + ExitCode, ) from .utils import ( - setup_logging, - split_comma_separated_tags, create_args_namespace, - save_summary_with_feedback, - display_results_table, display_performance_table, + display_results_table, + save_summary_with_feedback, + setup_logging, + split_comma_separated_tags, ) from .validators import ( - validate_additional_context, process_batch_manifest, process_batch_manifest_entries, + validate_additional_context, ) __all__ = [ @@ -53,4 +55,3 @@ "process_batch_manifest", "process_batch_manifest_entries", ] - diff --git a/src/madengine/cli/app.py b/src/madengine/cli/app.py index 2e761f49..b4871353 100644 --- a/src/madengine/cli/app.py +++ b/src/madengine/cli/app.py @@ -8,7 +8,8 @@ """ import sys -from importlib.metadata import PackageNotFoundError, version as pkg_version +from importlib.metadata import PackageNotFoundError +from importlib.metadata import version as pkg_version import typer from rich.traceback import install @@ -18,7 +19,7 @@ except ImportError: from typing_extensions import Annotated # Python 3.8 -from .commands import build, run, discover, report_app, database +from .commands import build, database, discover, report_app, run from .constants import ExitCode from .utils import console @@ -89,4 +90,3 @@ def cli_main() -> None: 
if __name__ == "__main__": cli_main() - diff --git a/src/madengine/cli/commands/__init__.py b/src/madengine/cli/commands/__init__.py index f77b432e..c88f662c 100644 --- a/src/madengine/cli/commands/__init__.py +++ b/src/madengine/cli/commands/__init__.py @@ -8,10 +8,9 @@ """ from .build import build -from .run import run +from .database import database from .discover import discover from .report import report_app -from .database import database +from .run import run __all__ = ["build", "run", "discover", "report_app", "database"] - diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py index 5b10a65c..f82331c8 100644 --- a/src/madengine/cli/commands/build.py +++ b/src/madengine/cli/commands/build.py @@ -29,7 +29,11 @@ save_summary_with_feedback, display_results_table, ) -from ..validators import validate_additional_context, process_batch_manifest, process_batch_manifest_entries +from ..validators import ( + validate_additional_context, + process_batch_manifest, + process_batch_manifest_entries, +) def build( @@ -40,9 +44,9 @@ def build( target_archs: Annotated[ List[str], typer.Option( - "--target-archs", - "-a", - help="Target GPU architectures to build for (e.g., gfx908,gfx90a,gfx942). If not specified, builds single image with MAD_SYSTEM_GPU_ARCHITECTURE from additional_context or detected GPU architecture." + "--target-archs", + "-a", + help="Target GPU architectures to build for (e.g., gfx908,gfx90a,gfx942). If not specified, builds single image with MAD_SYSTEM_GPU_ARCHITECTURE from additional_context or detected GPU architecture.", ), ] = [], registry: Annotated[ @@ -69,6 +73,17 @@ def build( help="File containing additional context JSON", ), ] = None, + config: Annotated[ + Optional[List[str]], + typer.Option( + "--config", + help=( + "YAML config file and/or Hydra overrides " + "(e.g., --config my_job.yaml, --config scheduler=slurm --config launcher=torchrun). 
" + "Cannot be combined with --additional-context or --additional-context-file." + ), + ), + ] = None, clean_docker_cache: Annotated[ bool, typer.Option("--clean-docker-cache", help="Rebuild images without using cache"), @@ -102,15 +117,47 @@ def build( # Process tags to handle comma-separated values # Supports both: --tags dummy --tags multi AND --tags dummy,multi processed_tags = split_comma_separated_tags(tags) - + + # --config is mutually exclusive with --additional-context and --additional-context-file + if config: + if additional_context and additional_context.strip() not in ("", "{}"): + console.print( + "[red]Error:[/red] --config cannot be used together with --additional-context. " + "Use one or the other.", + style="bold", + ) + raise typer.Exit(code=ExitCode.INVALID_ARGS.value) + if additional_context_file: + console.print( + "[red]Error:[/red] --config cannot be used together with --additional-context-file. " + "Use one or the other.", + style="bold", + ) + raise typer.Exit(code=ExitCode.INVALID_ARGS.value) + + from madengine.config import load_config + + config_ctx, config_meta = load_config(config) + + if not processed_tags and config_meta.get("model", {}).get("tags"): + processed_tags = config_meta["model"]["tags"] + if not registry and config_meta.get("build", {}).get("registry"): + registry = config_meta["build"]["registry"] + build_meta = config_meta.get("build", {}) + if not target_archs and build_meta.get("target_archs"): + target_archs = build_meta["target_archs"] + + additional_context = repr(config_ctx) + additional_context_file = None + # Validate mutually exclusive options if batch_manifest and processed_tags: console.print( "❌ [bold red]Error: Cannot specify both --batch-manifest and --tags options[/bold red]" ) raise typer.Exit(ExitCode.INVALID_ARGS) - - if additional_context_file and additional_context!="{}": + + if additional_context_file and additional_context != "{}": console.print( "❌ [bold red]Error: Cannot specify both 
--additional-context-file and --additional-context options[/bold red]" ) @@ -200,7 +247,7 @@ def build( console=console, ) as progress: task = progress.add_task("Initializing build orchestrator...", total=None) - + # Use new BuildOrchestrator orchestrator = BuildOrchestrator(args) progress.update(task, description="Building models...") @@ -212,12 +259,12 @@ def build( manifest_output=manifest_output, batch_build_metadata=batch_build_metadata, ) - + # Load build summary for display - with open(manifest_output, 'r') as f: + with open(manifest_output, "r") as f: manifest = json.load(f) build_summary = manifest.get("summary", {}) - + progress.update(task, description="Build completed!") # Handle batch manifest post-processing @@ -240,7 +287,7 @@ def build( # Check results and exit with appropriate code failed_builds = len(build_summary.get("failed_builds", [])) successful_builds = len(build_summary.get("successful_builds", [])) - + if failed_builds == 0: console.print( "🎉 [bold green]All builds completed successfully![/bold green]" @@ -258,9 +305,7 @@ def build( raise typer.Exit(ExitCode.BUILD_FAILURE) # Non-zero exit for CI/CD else: # All failed - console.print( - f"💥 [bold red]All builds failed[/bold red]" - ) + console.print(f"💥 [bold red]All builds failed[/bold red]") raise typer.Exit(ExitCode.BUILD_FAILURE) except typer.Exit: @@ -268,52 +313,52 @@ def build( except BuildError as e: # Specific build error handling console.print(f"💥 [bold red]Build error: {e}[/bold red]") - if hasattr(e, 'suggestions') and e.suggestions: + if hasattr(e, "suggestions") and e.suggestions: console.print("\n💡 [cyan]Suggestions:[/cyan]") for suggestion in e.suggestions: console.print(f" • {suggestion}") raise typer.Exit(ExitCode.BUILD_FAILURE) - + except ConfigurationError as e: # Configuration errors console.print(f"⚙️ [bold red]Configuration error: {e}[/bold red]") - if hasattr(e, 'suggestions') and e.suggestions: + if hasattr(e, "suggestions") and e.suggestions: console.print("\n💡 
[cyan]Suggestions:[/cyan]") for suggestion in e.suggestions: console.print(f" • {suggestion}") raise typer.Exit(ExitCode.INVALID_ARGS) - + except DiscoveryError as e: # Model discovery errors console.print(f"🔍 [bold red]Discovery error: {e}[/bold red]") console.print("💡 Check MODEL_DIR or models.json configuration") raise typer.Exit(ExitCode.FAILURE) - + except KeyboardInterrupt: console.print("\n🛑 [yellow]Build cancelled by user[/yellow]") raise typer.Exit(ExitCode.FAILURE) - + except PermissionError as e: console.print(f"🔒 [bold red]Permission denied: {e}[/bold red]") - console.print("💡 Check file/directory permissions or run with appropriate privileges") + console.print( + "💡 Check file/directory permissions or run with appropriate privileges" + ) raise typer.Exit(ExitCode.FAILURE) - + except FileNotFoundError as e: console.print(f"📁 [bold red]File not found: {e}[/bold red]") console.print("💡 Check that all required files exist") raise typer.Exit(ExitCode.FAILURE) - + except Exception as e: console.print(f"💥 [bold red]Unexpected error: {e}[/bold red]") if verbose: console.print_exception() - + from madengine.core.errors import handle_error, create_error_context + context = create_error_context( - operation="build", - phase="build", - component="build_command" + operation="build", phase="build", component="build_command" ) handle_error(e, context=context) raise typer.Exit(ExitCode.FAILURE) - diff --git a/src/madengine/cli/commands/database.py b/src/madengine/cli/commands/database.py index 8f804e06..2d5b699d 100644 --- a/src/madengine/cli/commands/database.py +++ b/src/madengine/cli/commands/database.py @@ -22,7 +22,7 @@ from madengine.database.mongodb import ( upload_file_to_mongodb, MongoDBConfig, - UploadOptions + UploadOptions, ) from ..constants import ExitCode from ..utils import setup_logging @@ -33,85 +33,64 @@ def database( file: Annotated[ str, - typer.Option( - "--file", "-f", - help="Path to file (CSV or JSON, auto-detected)" - ), + 
typer.Option("--file", "-f", help="Path to file (CSV or JSON, auto-detected)"), ], database: Annotated[ str, - typer.Option( - "--database", "--db", - help="MongoDB database name" - ), + typer.Option("--database", "--db", help="MongoDB database name"), ], collection: Annotated[ str, - typer.Option( - "--collection", "-c", - help="MongoDB collection name" - ), + typer.Option("--collection", "-c", help="MongoDB collection name"), ], unique_key: Annotated[ str, typer.Option( - "--unique-key", "-k", - help="Unique field(s) for deduplication (comma-separated, auto-detected if not specified)" + "--unique-key", + "-k", + help="Unique field(s) for deduplication (comma-separated, auto-detected if not specified)", ), ] = None, batch_size: Annotated[ int, - typer.Option( - "--batch-size", - help="Batch size for bulk operations" - ), + typer.Option("--batch-size", help="Batch size for bulk operations"), ] = 1000, no_upsert: Annotated[ bool, typer.Option( - "--no-upsert", - help="Insert only (don't update existing documents)" + "--no-upsert", help="Insert only (don't update existing documents)" ), ] = False, no_index: Annotated[ bool, - typer.Option( - "--no-index", - help="Skip automatic index creation" - ), + typer.Option("--no-index", help="Skip automatic index creation"), ] = False, dry_run: Annotated[ bool, - typer.Option( - "--dry-run", - help="Validate without uploading" - ), + typer.Option("--dry-run", help="Validate without uploading"), ] = False, verbose: Annotated[ bool, - typer.Option( - "--verbose", "-v", - help="Verbose output" - ), + typer.Option("--verbose", "-v", help="Verbose output"), ] = False, ) -> None: """ 💾 Upload CSV or JSON files to MongoDB. - + Supports intelligent type preservation, automatic deduplication, and bulk operations for optimal performance. 
- + \b Examples: # Upload JSON with auto-detection madengine database -f perf_entry_super.json --db mydb -c perf_super - + # Upload CSV with custom unique key madengine database -f perf.csv --db test -c results -k model,timestamp - + # Dry run to validate madengine database -f data.json --db test -c data --dry-run - + \b Environment Variables: MONGO_HOST MongoDB host (default: localhost) @@ -119,12 +98,12 @@ def database( MONGO_USER MongoDB username MONGO_PASSWORD MongoDB password """ - + setup_logging(verbose) - + # Display configuration file_path = Path(file) - + console.print( Panel( f"💾 [bold cyan]MongoDB Upload[/bold cyan]\n\n" @@ -136,29 +115,29 @@ def database( border_style="cyan", ) ) - + # Validate file exists if not file_path.exists(): console.print(f"❌ [bold red]File not found: {file}[/bold red]") raise typer.Exit(ExitCode.FAILURE) - + # Prepare configuration config = MongoDBConfig.from_env() - + # Parse unique fields unique_fields = None if unique_key: - unique_fields = [k.strip() for k in unique_key.split(',')] - + unique_fields = [k.strip() for k in unique_key.split(",")] + # Prepare options options = UploadOptions( unique_fields=unique_fields, upsert=not no_upsert, batch_size=batch_size, create_indexes=not no_index, - dry_run=dry_run + dry_run=dry_run, ) - + try: # Perform upload result = upload_file_to_mongodb( @@ -166,13 +145,13 @@ def database( database_name=database, collection_name=collection, config=config, - options=options + options=options, ) - + # Display results console.print() result.print_summary() - + # Show errors if any if result.errors and verbose: console.print("\n⚠️ [yellow]Errors:[/yellow]") @@ -180,15 +159,19 @@ def database( console.print(f" {i}. {error}") if len(result.errors) > 10: console.print(f" ... 
and {len(result.errors) - 10} more errors") - + # Exit with appropriate code if result.status == "success": raise typer.Exit(ExitCode.SUCCESS) elif result.status == "partial": - raise typer.Exit(ExitCode.SUCCESS if result.documents_inserted + result.documents_updated > 0 else ExitCode.FAILURE) + raise typer.Exit( + ExitCode.SUCCESS + if result.documents_inserted + result.documents_updated > 0 + else ExitCode.FAILURE + ) else: raise typer.Exit(ExitCode.FAILURE) - + except typer.Exit: # Re-raise typer.Exit without catching it raise diff --git a/src/madengine/cli/commands/discover.py b/src/madengine/cli/commands/discover.py index 867a50e0..c160c3c8 100644 --- a/src/madengine/cli/commands/discover.py +++ b/src/madengine/cli/commands/discover.py @@ -18,13 +18,20 @@ from madengine.utils.discover_models import DiscoverModels from ..constants import ExitCode -from ..utils import console, setup_logging, split_comma_separated_tags, create_args_namespace +from ..utils import ( + console, + setup_logging, + split_comma_separated_tags, + create_args_namespace, +) def discover( tags: Annotated[ List[str], - typer.Option("--tags", "-t", help="Model tags to discover (can specify multiple)"), + typer.Option( + "--tags", "-t", help="Model tags to discover (can specify multiple)" + ), ] = [], verbose: Annotated[ bool, typer.Option("--verbose", "-v", help="Enable verbose logging") @@ -58,17 +65,18 @@ def discover( try: # Create args namespace similar to mad.py args = create_args_namespace(tags=processed_tags) - + # Use DiscoverModels class # Note: DiscoverModels prints output directly and returns None discover_models_instance = DiscoverModels(args=args) result = discover_models_instance.run() - - console.print("✅ [bold green]Model discovery completed successfully[/bold green]") + + console.print( + "✅ [bold green]Model discovery completed successfully[/bold green]" + ) except Exception as e: console.print(f"💥 [bold red]Model discovery failed: {e}[/bold red]") if verbose: 
console.print_exception() raise typer.Exit(ExitCode.FAILURE) - diff --git a/src/madengine/cli/commands/report.py b/src/madengine/cli/commands/report.py index 2bd348c0..e73c046e 100644 --- a/src/madengine/cli/commands/report.py +++ b/src/madengine/cli/commands/report.py @@ -19,12 +19,11 @@ except ImportError: from typing_extensions import Annotated # Python 3.8 -from madengine.reporting.csv_to_html import ConvertCsvToHtml from madengine.reporting.csv_to_email import ConvertCsvToEmail +from madengine.reporting.csv_to_html import ConvertCsvToHtml from ..constants import ExitCode -from ..utils import console, setup_logging, create_args_namespace - +from ..utils import console, create_args_namespace, setup_logging # Create a sub-app for report commands report_app = typer.Typer( @@ -39,10 +38,7 @@ def to_html( csv_file_path: Annotated[ str, - typer.Option( - "--csv-file-path", - help="Path to the CSV file to convert to HTML" - ), + typer.Option("--csv-file-path", help="Path to the CSV file to convert to HTML"), ], verbose: Annotated[ bool, typer.Option("--verbose", "-v", help="Enable verbose logging") @@ -50,10 +46,10 @@ def to_html( ) -> None: """ 📄 Convert a single CSV file to HTML report. - + This command converts a CSV file to an HTML table format, useful for viewing performance metrics in a web browser. 
- + Examples: madengine report to-html --csv-file-path perf_amd.csv madengine report to-html --csv-file-path results/perf_mi300.csv @@ -71,29 +67,37 @@ def to_html( # Validate input if not os.path.exists(csv_file_path): - console.print(f"❌ [bold red]Error: CSV file not found: {csv_file_path}[/bold red]") + console.print( + f"❌ [bold red]Error: CSV file not found: {csv_file_path}[/bold red]" + ) raise typer.Exit(ExitCode.FAILURE) - + if not os.path.isfile(csv_file_path): - console.print(f"❌ [bold red]Error: Path is not a file: {csv_file_path}[/bold red]") + console.print( + f"❌ [bold red]Error: Path is not a file: {csv_file_path}[/bold red]" + ) raise typer.Exit(ExitCode.FAILURE) - - if not csv_file_path.endswith('.csv'): - console.print(f"❌ [bold red]Error: File must be a CSV file: {csv_file_path}[/bold red]") + + if not csv_file_path.endswith(".csv"): + console.print( + f"❌ [bold red]Error: File must be a CSV file: {csv_file_path}[/bold red]" + ) raise typer.Exit(ExitCode.FAILURE) try: # Create args namespace for compatibility with existing code args = create_args_namespace(csv_file_path=csv_file_path) - + # Use ConvertCsvToHtml class converter = ConvertCsvToHtml(args=args) result = converter.run() - + if result: # Determine output file name - output_file = str(Path(csv_file_path).with_suffix('.html')) - console.print(f"✅ [bold green]Successfully converted to: {output_file}[/bold green]") + output_file = str(Path(csv_file_path).with_suffix(".html")) + console.print( + f"✅ [bold green]Successfully converted to: {output_file}[/bold green]" + ) else: console.print("❌ [bold red]Conversion failed[/bold red]") raise typer.Exit(ExitCode.FAILURE) @@ -112,16 +116,12 @@ def to_email( typer.Option( "--directory", "--dir", - help="Path to directory containing CSV files to consolidate" + help="Path to directory containing CSV files to consolidate", ), ] = ".", output: Annotated[ str, - typer.Option( - "--output", - "-o", - help="Output HTML filename" - ), + 
typer.Option("--output", "-o", help="Output HTML filename"), ] = "run_results.html", verbose: Annotated[ bool, typer.Option("--verbose", "-v", help="Enable verbose logging") @@ -129,10 +129,10 @@ def to_email( ) -> None: """ 📧 Convert all CSV files in a directory to consolidated email-ready HTML report. - + This command scans a directory for CSV files and combines them into a single HTML report with sections for each CSV file, suitable for email distribution. - + Examples: madengine report to-email madengine report to-email --directory ./results @@ -152,26 +152,36 @@ def to_email( # Validate input if not os.path.exists(directory): - console.print(f"❌ [bold red]Error: Directory not found: {directory}[/bold red]") + console.print( + f"❌ [bold red]Error: Directory not found: {directory}[/bold red]" + ) raise typer.Exit(ExitCode.FAILURE) - + if not os.path.isdir(directory): - console.print(f"❌ [bold red]Error: Path is not a directory: {directory}[/bold red]") - console.print(f"💡 [cyan]Tip: Use 'to-html' command for single CSV files[/cyan]") + console.print( + f"❌ [bold red]Error: Path is not a directory: {directory}[/bold red]" + ) + console.print( + f"💡 [cyan]Tip: Use 'to-html' command for single CSV files[/cyan]" + ) raise typer.Exit(ExitCode.FAILURE) try: # Create args namespace for compatibility with existing code # The old code expects 'csv_file_path' to be the directory args = create_args_namespace(csv_file_path=directory, output_file=output) - + # Use ConvertCsvToEmail class converter = ConvertCsvToEmail(args=args) result = converter.run() - + if result: - output_path = os.path.join(directory, output) if directory != "." else output - console.print(f"✅ [bold green]Successfully generated email report: {output_path}[/bold green]") + output_path = ( + os.path.join(directory, output) if directory != "." 
else output + ) + console.print( + f"✅ [bold green]Successfully generated email report: {output_path}[/bold green]" + ) else: console.print("⚠️ [yellow]No CSV files found to process[/yellow]") @@ -186,4 +196,3 @@ def to_email( def report() -> typer.Typer: """Return the report sub-app.""" return report_app - diff --git a/src/madengine/cli/commands/run.py b/src/madengine/cli/commands/run.py index 0efc46e8..a23ac280 100644 --- a/src/madengine/cli/commands/run.py +++ b/src/madengine/cli/commands/run.py @@ -81,6 +81,17 @@ def run( help="File containing additional context JSON", ), ] = None, + config: Annotated[ + Optional[List[str]], + typer.Option( + "--config", + help=( + "YAML config file and/or Hydra overrides " + "(e.g., --config my_job.yaml, --config scheduler=slurm --config launcher=torchrun). " + "Cannot be combined with --additional-context or --additional-context-file." + ), + ), + ] = None, keep_alive: Annotated[ bool, typer.Option("--keep-alive", help="Keep Docker containers alive after run"), @@ -164,6 +175,39 @@ def run( # Process tags to handle comma-separated values processed_tags = split_comma_separated_tags(tags) + # --config is mutually exclusive with --additional-context and --additional-context-file + if config: + if additional_context and additional_context.strip() not in ("", "{}"): + console.print( + "[red]Error:[/red] --config cannot be used together with --additional-context. " + "Use one or the other.", + style="bold", + ) + raise typer.Exit(code=ExitCode.INVALID_ARGS.value) + if additional_context_file: + console.print( + "[red]Error:[/red] --config cannot be used together with --additional-context-file. 
" + "Use one or the other.", + style="bold", + ) + raise typer.Exit(code=ExitCode.INVALID_ARGS.value) + + from madengine.config import load_config + + config_ctx, config_meta = load_config(config) + + if not processed_tags and config_meta.get("model", {}).get("tags"): + processed_tags = config_meta["model"]["tags"] + if timeout == DEFAULT_TIMEOUT and config_meta.get("model", {}).get("timeout"): + timeout = config_meta["model"]["timeout"] + if not manifest_file and config_meta.get("model", {}).get("manifest_file"): + manifest_file = config_meta["model"]["manifest_file"] + if not registry and config_meta.get("build", {}).get("registry"): + registry = config_meta["build"]["registry"] + + additional_context = repr(config_ctx) + additional_context_file = None + # Input validation if timeout < -1: console.print( @@ -247,7 +291,7 @@ def run( task = progress.add_task( "Initializing execution orchestrator...", total=None ) - + # Use new RunOrchestrator orchestrator = RunOrchestrator(args) progress.update(task, description="Running models...") @@ -262,23 +306,29 @@ def run( # Display results summary display_results_table(execution_summary, "Execution Results") - + # Display detailed performance metrics from CSV (show all historical runs, mark current ones) perf_csv_path = getattr(args, "output", DEFAULT_PERF_OUTPUT) session_start_row = execution_summary.get("session_start_row") display_performance_table(perf_csv_path, session_start_row) - + # Cleanup session marker AFTER display (so display functions can use it) from madengine.utils.session_tracker import SessionTracker + tracker = SessionTracker(perf_csv_path) tracker.cleanup_marker() - + # Cleanup intermediate perf files if requested if cleanup_perf: - from madengine.utils.perf_cleanup import cleanup_perf_intermediates as do_cleanup - console.print("\n🧹 [cyan]Cleaning up intermediate performance files...[/cyan]") + from madengine.utils.perf_cleanup import ( + cleanup_perf_intermediates as do_cleanup, + ) + + console.print( 
+ "\n🧹 [cyan]Cleaning up intermediate performance files...[/cyan]" + ) do_cleanup() - + save_summary_with_feedback(execution_summary, summary_output, "Execution") failed_runs = len(execution_summary.get("failed_runs", [])) @@ -351,10 +401,10 @@ def run( task = progress.add_task( "Initializing workflow orchestrator...", total=None ) - + # Use new RunOrchestrator (handles build+run automatically when tags provided) orchestrator = RunOrchestrator(args) - + progress.update(task, description="Building and running models...") execution_summary = orchestrator.execute( manifest_file=None, # Triggers build phase @@ -365,7 +415,7 @@ def run( progress.update(task, description="Workflow completed!") # Load build summary from generated manifest - with open(manifest_output, 'r') as f: + with open(manifest_output, "r") as f: manifest = json.load(f) build_summary = manifest.get("summary", {}) @@ -382,23 +432,29 @@ def run( # Display results display_results_table(build_summary, "Build Results") display_results_table(execution_summary, "Execution Results") - + # Display detailed performance metrics from CSV (show all historical runs, mark current ones) perf_csv_path = getattr(args, "output", DEFAULT_PERF_OUTPUT) session_start_row = execution_summary.get("session_start_row") display_performance_table(perf_csv_path, session_start_row) - + # Cleanup session marker AFTER display (so display functions can use it) from madengine.utils.session_tracker import SessionTracker + tracker = SessionTracker(perf_csv_path) tracker.cleanup_marker() - + # Cleanup intermediate perf files if requested if cleanup_perf: - from madengine.utils.perf_cleanup import cleanup_perf_intermediates as do_cleanup - console.print("\n🧹 [cyan]Cleaning up intermediate performance files...[/cyan]") + from madengine.utils.perf_cleanup import ( + cleanup_perf_intermediates as do_cleanup, + ) + + console.print( + "\n🧹 [cyan]Cleaning up intermediate performance files...[/cyan]" + ) do_cleanup() - + 
save_summary_with_feedback(workflow_summary, summary_output, "Workflow") if workflow_summary["overall_success"]: @@ -435,41 +491,39 @@ def run( except ExecutionError as e: # Runtime execution errors console.print(f"💥 [bold red]Runtime error: {e}[/bold red]") - if hasattr(e, 'suggestions') and e.suggestions: + if hasattr(e, "suggestions") and e.suggestions: console.print("\n💡 [cyan]Suggestions:[/cyan]") for suggestion in e.suggestions: console.print(f" • {suggestion}") raise typer.Exit(ExitCode.RUN_FAILURE) - + except ConfigurationError as e: # Configuration errors console.print(f"⚙️ [bold red]Configuration error: {e}[/bold red]") - if hasattr(e, 'suggestions') and e.suggestions: + if hasattr(e, "suggestions") and e.suggestions: console.print("\n💡 [cyan]Suggestions:[/cyan]") for suggestion in e.suggestions: console.print(f" • {suggestion}") raise typer.Exit(ExitCode.INVALID_ARGS) - + except KeyboardInterrupt: console.print("\n🛑 [yellow]Run cancelled by user[/yellow]") raise typer.Exit(ExitCode.FAILURE) - + except FileNotFoundError as e: console.print(f"📁 [bold red]File not found: {e}[/bold red]") console.print("💡 Check manifest file path and required files") raise typer.Exit(ExitCode.FAILURE) - + except Exception as e: console.print(f"💥 [bold red]Run process failed: {e}[/bold red]") if verbose: console.print_exception() - + from madengine.core.errors import handle_error, create_error_context + context = create_error_context( - operation="run", - phase="run", - component="run_command" + operation="run", phase="run", component="run_command" ) handle_error(e, context=context) raise typer.Exit(ExitCode.FAILURE) - diff --git a/src/madengine/cli/constants.py b/src/madengine/cli/constants.py index b437fa30..e80eb5c1 100644 --- a/src/madengine/cli/constants.py +++ b/src/madengine/cli/constants.py @@ -29,4 +29,3 @@ class ExitCode(IntEnum): DEFAULT_DATA_CONFIG = "data.json" DEFAULT_TOOLS_CONFIG = "./scripts/common/tools.json" DEFAULT_TIMEOUT = -1 - diff --git 
def split_comma_separated_tags(tags: List[str]) -> List[str]:
    """Split comma-separated tags into individual tags.

    Handles both formats:
    - Multiple flags: --tags dummy --tags multi → ['dummy', 'multi']
    - Comma-separated: --tags dummy,multi → ['dummy', 'multi']

    Whitespace around each tag is stripped and empty fragments (for example
    from ``"a,,b"`` or a trailing comma) are dropped. Input order and
    duplicates are preserved.

    Args:
        tags: List of tag strings (may contain comma-separated values). A
            single bare string is also accepted and treated as a one-element
            list, so it is split on commas instead of being iterated
            character by character.

    Returns:
        List of individual tag strings; empty when ``tags`` is empty or None.
    """
    if not tags:
        return []

    if isinstance(tags, str):
        # Iterating a str yields single characters, which would turn
        # "dummy" into ['d', 'u', 'm', 'm', 'y']. Wrap it so the whole
        # string is handled as one tag expression.
        tags = [tags]

    return [
        piece
        for tag in tags
        for piece in map(str.strip, tag.split(","))
        if piece
    ]
- + Automatically detects: - BUILD results: Simple format (no nodes/performance) - RUN results with nodes: Enhanced per-node breakdown """ successful = summary.get("successful_builds", summary.get("successful_runs", [])) failed = summary.get("failed_builds", summary.get("failed_runs", [])) - + # Detect if this is a RUN result with per-node data (vs BUILD result) has_node_data = False for item in successful + failed: if isinstance(item, dict) and ("nodes" in item or "perf_data" in item): has_node_data = True break - + # Create table with appropriate columns based on result type if has_node_data: # RUN results - enhanced format with per-node breakdown table = Table( - title=f"⚡ {title} (Per-Node Breakdown)", - show_header=True, - header_style="bold magenta" + title=f"⚡ {title} (Per-Node Breakdown)", + show_header=True, + header_style="bold magenta", ) table.add_column("Index", justify="right", style="dim") table.add_column("Status", style="bold") @@ -135,14 +137,12 @@ def display_results_table(summary: Dict, title: str, show_gpu_arch: bool = False else: # BUILD results - simple format (no per-node data) table = Table( - title=f"⚡ {title}", - show_header=True, - header_style="bold magenta" + title=f"⚡ {title}", show_header=True, header_style="bold magenta" ) table.add_column("Index", justify="right", style="dim") table.add_column("Status", style="bold") table.add_column("Model", style="cyan") - + # Add GPU Architecture column if multi-arch build was used if show_gpu_arch: table.add_column("GPU Architecture", style="blue") @@ -151,11 +151,7 @@ def build_gpu_arch_display(item: Dict) -> str: """Prefer gpu_architecture (DockerBuilder) then architecture (failures / legacy).""" if not isinstance(item, dict): return "N/A" - return ( - item.get("gpu_architecture") - or item.get("architecture") - or "N/A" - ) + return item.get("gpu_architecture") or item.get("architecture") or "N/A" # Helper function to extract model name from build result def extract_model_name(item): @@ 
-171,7 +167,11 @@ def extract_model_name(item): docker_image = item["docker_image"] if docker_image.startswith("ci-"): parts = docker_image[3:].split("_") - model_name = parts[0] if len(parts) >= 2 else (parts[0] if parts else docker_image) + model_name = ( + parts[0] + if len(parts) >= 2 + else (parts[0] if parts else docker_image) + ) else: model_name = docker_image return model_name @@ -189,13 +189,13 @@ def format_number(value): # Add successful builds/runs row_index = 1 job_summaries = [] # For final summary line - + for item in successful: if isinstance(item, dict): model_name = extract_model_name(item) nodes = item.get("nodes", []) perf_data = item.get("perf_data", {}) - + if has_node_data: # RUN results - show per-node breakdown if not nodes: @@ -204,49 +204,69 @@ def format_number(value): node_str = "node-0" perf = perf_data.get("performance", "-") metric = perf_data.get("metric", "-") - - row = [str(row_index), status, model_name, node_str, format_number(perf), metric] + + row = [ + str(row_index), + status, + model_name, + node_str, + format_number(perf), + metric, + ] if show_gpu_arch: row.append(perf_data.get("gpu_architecture", "N/A")) table.add_row(*row) row_index += 1 - - job_summaries.append({ - "model": model_name, - "nodes_succeeded": 1, - "nodes_total": 1, - "aggregated_perf": perf, - "metric": metric - }) + + job_summaries.append( + { + "model": model_name, + "nodes_succeeded": 1, + "nodes_total": 1, + "aggregated_perf": perf, + "metric": metric, + } + ) else: # Multi-node - show all nodes aggregated_perf = perf_data.get("performance") aggregated_metric = perf_data.get("metric") - - nodes_succeeded = sum(1 for n in nodes if n.get("status") == "SUCCESS") - + + nodes_succeeded = sum( + 1 for n in nodes if n.get("status") == "SUCCESS" + ) + for node in nodes: status_icon = "✅" if node.get("status") == "SUCCESS" else "❌" status = f"{status_icon} {node.get('status')}" node_str = f"node-{node['node_id']}" - + # Show node-local performance perf = 
node.get("performance", "-") metric = node.get("metric", "-") - - row = [str(row_index), status, model_name, node_str, format_number(perf) if perf != "-" else "-", metric if metric else "-"] + + row = [ + str(row_index), + status, + model_name, + node_str, + format_number(perf) if perf != "-" else "-", + metric if metric else "-", + ] if show_gpu_arch: row.append(perf_data.get("gpu_architecture", "N/A")) table.add_row(*row) row_index += 1 - - job_summaries.append({ - "model": model_name, - "nodes_succeeded": nodes_succeeded, - "nodes_total": len(nodes), - "aggregated_perf": aggregated_perf, - "metric": aggregated_metric - }) + + job_summaries.append( + { + "model": model_name, + "nodes_succeeded": nodes_succeeded, + "nodes_total": len(nodes), + "aggregated_perf": aggregated_perf, + "metric": aggregated_metric, + } + ) else: # BUILD results - simple format (no node/performance columns) status = "✅ Success" @@ -272,12 +292,19 @@ def format_number(value): if isinstance(item, dict): model_name = item.get("model", "Unknown") nodes = item.get("nodes", []) - + if has_node_data: # RUN results - show per-node failures if not nodes: # Single failure - row = [str(row_index), "❌ Failed", model_name, "node-0", "-", item.get("error", "Unknown")] + row = [ + str(row_index), + "❌ Failed", + model_name, + "node-0", + "-", + item.get("error", "Unknown"), + ] if show_gpu_arch: row.append(item.get("architecture", "N/A")) table.add_row(*row) @@ -289,7 +316,14 @@ def format_number(value): status = f"{status_icon} {node.get('status', 'FAILED')}" node_str = f"node-{node['node_id']}" error = node.get("error", "-") - row = [str(row_index), status, model_name, node_str, "-", error if error else "-"] + row = [ + str(row_index), + status, + model_name, + node_str, + "-", + error if error else "-", + ] if show_gpu_arch: row.append("N/A") table.add_row(*row) @@ -322,7 +356,7 @@ def format_number(value): table.add_row(*row) console.print(table) - + # Print job-level summaries for multi-node jobs 
(RUN results only) if has_node_data and job_summaries: console.print("\n💡 [bold]Job Summary:[/bold]") @@ -338,11 +372,13 @@ def format_number(value): ) -def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row: int = None) -> None: +def display_performance_table( + perf_csv_path: str = "perf.csv", session_start_row: int = None +) -> None: """Display performance metrics from perf.csv file. - + Shows all historical runs with visual markers for current session runs. - + Args: perf_csv_path: Path to the performance CSV file session_start_row: Optional row number to filter from (for current session only) @@ -350,40 +386,40 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row if not os.path.exists(perf_csv_path): console.print(f"[yellow]⚠️ Performance CSV not found: {perf_csv_path}[/yellow]") return - + try: import pandas as pd from madengine.utils.session_tracker import SessionTracker - + # Read CSV file df = pd.read_csv(perf_csv_path) - + if df.empty: console.print("[yellow]⚠️ Performance CSV is empty[/yellow]") return - + total_rows = len(df) - + # Try parameter first, then fall back to marker file if session_start_row is None: - session_start_row = SessionTracker.load_session_marker_for_csv(perf_csv_path) - + session_start_row = SessionTracker.load_session_marker_for_csv( + perf_csv_path + ) + # Count current session runs for title if session_start_row is not None and session_start_row < total_rows: current_run_count = total_rows - session_start_row title = f"📊 Performance Results (all {total_rows} runs, {current_run_count} from current session)" else: title = f"📊 Performance Results (all {total_rows} runs)" - + # Create performance table - perf_table = Table( - title=title, - show_header=True, - header_style="bold magenta" - ) - + perf_table = Table(title=title, show_header=True, header_style="bold magenta") + # Add columns (with "Run" marker column as first column) - perf_table.add_column("Run", 
justify="center", width=4) # Marker column for current session + perf_table.add_column( + "Run", justify="center", width=4 + ) # Marker column for current session perf_table.add_column("Index", justify="right", style="dim") perf_table.add_column("Model", style="cyan") perf_table.add_column("Topology", justify="center", style="blue") @@ -395,8 +431,8 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row perf_table.add_column("Status", style="bold") perf_table.add_column("Duration", justify="right", style="blue", min_width=8) perf_table.add_column("Data Name", style="magenta") - perf_table.add_column("Data Provider", style="magenta") - + perf_table.add_column("Data Provider", style="magenta") + # Helper function to format duration (accepts float seconds or "Xs" string) def format_duration(duration): if pd.isna(duration) or duration == "" or duration is None: @@ -414,7 +450,7 @@ def format_duration(duration): return f"{dur/60:.1f}m" except (ValueError, TypeError): return str(duration) if duration else "N/A" - + # Helper function to format performance def format_performance(perf): if pd.isna(perf) or perf == "": @@ -431,48 +467,80 @@ def format_performance(perf): return f"{val:.4g}" except (ValueError, TypeError): return str(perf) - + # Add rows from dataframe for idx, row in df.iterrows(): # Determine if this is a current session run - is_current_run = (session_start_row is not None and idx >= session_start_row) - run_marker = "[bold green]➤[/]" if is_current_run else "" # Arrow marker for current runs - + is_current_run = session_start_row is not None and idx >= session_start_row + run_marker = ( + "[bold green]➤[/]" if is_current_run else "" + ) # Arrow marker for current runs + model_val = row.get("model", "Unknown") model = ( "Unknown" - if (pd.isna(model_val) or model_val == "" or str(model_val).strip() == "nan") + if ( + pd.isna(model_val) + or model_val == "" + or str(model_val).strip() == "nan" + ) else str(model_val) ) - dataname 
= str(row.get("dataname", "")) if not pd.isna(row.get("dataname")) and row.get("dataname") != "" else "N/A" - data_provider_type = str(row.get("data_provider_type", "")) if not pd.isna(row.get("data_provider_type")) and row.get("data_provider_type") != "" else "N/A" - + dataname = ( + str(row.get("dataname", "")) + if not pd.isna(row.get("dataname")) and row.get("dataname") != "" + else "N/A" + ) + data_provider_type = ( + str(row.get("data_provider_type", "")) + if not pd.isna(row.get("data_provider_type")) + and row.get("data_provider_type") != "" + else "N/A" + ) + # Format topology: Always show "NxG" format for consistency # Examples: "1N×1G" (single node, single GPU), "1N×4G" (single node, 4 GPUs), "2N×2G" (2 nodes, 2 GPUs each) n_gpus = row.get("n_gpus", 1) nnodes = row.get("nnodes", 1) gpus_per_node = row.get("gpus_per_node", n_gpus) - + # Determine topology display format try: - nnodes_int = int(nnodes) if not pd.isna(nnodes) and str(nnodes) != "" else 1 - gpus_per_node_int = int(gpus_per_node) if not pd.isna(gpus_per_node) and str(gpus_per_node) != "" else int(n_gpus) if not pd.isna(n_gpus) else 1 - + nnodes_int = ( + int(nnodes) if not pd.isna(nnodes) and str(nnodes) != "" else 1 + ) + gpus_per_node_int = ( + int(gpus_per_node) + if not pd.isna(gpus_per_node) and str(gpus_per_node) != "" + else int(n_gpus) if not pd.isna(n_gpus) else 1 + ) + # Always show NxG format for consistency topology = f"{nnodes_int}N×{gpus_per_node_int}G" except (ValueError, TypeError): # Fallback if parsing fails topology = "N/A" - + # Get launcher value as-is from the CSV (don't default to "docker" here) - launcher = str(row.get("launcher", "")) if not pd.isna(row.get("launcher")) and row.get("launcher") != "" else "N/A" - deployment_type = str(row.get("deployment_type", "local")) if not pd.isna(row.get("deployment_type")) and row.get("deployment_type") != "" else "local" + launcher = ( + str(row.get("launcher", "")) + if not pd.isna(row.get("launcher")) and row.get("launcher") 
!= "" + else "N/A" + ) + deployment_type = ( + str(row.get("deployment_type", "local")) + if not pd.isna(row.get("deployment_type")) + and row.get("deployment_type") != "" + else "local" + ) gpu_arch = str(row.get("gpu_architecture", "N/A")) performance = format_performance(row.get("performance", "")) - metric = str(row.get("metric", "")) if not pd.isna(row.get("metric")) else "" - + metric = ( + str(row.get("metric", "")) if not pd.isna(row.get("metric")) else "" + ) + status = str(row.get("status", "UNKNOWN")) - + # Duration column shows ONLY test/execution time (not build time) # If test_duration is missing, show N/A test_dur = row.get("test_duration", "") @@ -480,7 +548,7 @@ def format_performance(perf): duration = format_duration(test_dur) else: duration = "N/A" - + # Color-code status if status == "SUCCESS": status_display = "✅ Success" @@ -488,13 +556,13 @@ def format_performance(perf): status_display = "❌ Failed" else: status_display = f"⚠️ {status}" - + perf_table.add_row( - run_marker, # Marker column showing ➤ for current runs + run_marker, # Marker column showing ➤ for current runs str(idx), model, topology, - launcher, # Distributed launcher (docker, torchrun, vllm, etc.) + launcher, # Distributed launcher (docker, torchrun, vllm, etc.) 
deployment_type, gpu_arch, performance, @@ -502,24 +570,27 @@ def format_performance(perf): status_display, duration, dataname, - data_provider_type + data_provider_type, ) - + console.print() # Add blank line console.print(perf_table) - + # Print summary statistics total_runs = len(df) successful_runs = len(df[df["status"] == "SUCCESS"]) failed_runs = len(df[df["status"] == "FAILURE"]) - + console.print() - console.print(f"[bold]Summary:[/bold] {total_runs} total runs, " - f"[green]{successful_runs} successful[/green], " - f"[red]{failed_runs} failed[/red]") - + console.print( + f"[bold]Summary:[/bold] {total_runs} total runs, " + f"[green]{successful_runs} successful[/green], " + f"[red]{failed_runs} failed[/red]" + ) + except ImportError: - console.print("[yellow]⚠️ pandas not installed. Install with: pip install pandas[/yellow]") + console.print( + "[yellow]⚠️ pandas not installed. Install with: pip install pandas[/yellow]" + ) except Exception as e: console.print(f"[red]❌ Error reading performance CSV: {e}[/red]") - diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py index 1f7ee001..820248e0 100644 --- a/src/madengine/cli/validators.py +++ b/src/madengine/cli/validators.py @@ -14,22 +14,20 @@ import typer from rich.console import Console -from madengine.utils.discover_models import DiscoverModels from madengine.core.additional_context_defaults import ( DEFAULT_GPU_VENDOR, DEFAULT_GUEST_OS, apply_build_context_defaults, ) -from .constants import ExitCode, VALID_GPU_VENDORS, VALID_GUEST_OS -from .utils import create_args_namespace +from madengine.utils.discover_models import DiscoverModels +from .constants import VALID_GPU_VENDORS, VALID_GUEST_OS, ExitCode +from .utils import create_args_namespace # Initialize Rich console console = Console() -_EXAMPLE_ADDITIONAL_CONTEXT = ( - '--additional-context \'{"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx942"}}\'' -) +_EXAMPLE_ADDITIONAL_CONTEXT = '--additional-context 
\'{"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx942"}}\'' def parse_additional_context_cli_string(additional_context: str) -> Dict[str, Any]: @@ -44,9 +42,7 @@ def parse_additional_context_cli_string(additional_context: str) -> Dict[str, An try: parsed = ast.literal_eval(additional_context) except (ValueError, SyntaxError) as e: - console.print( - f"❌ Invalid additional_context format: [red]{e}[/red]" - ) + console.print(f"❌ Invalid additional_context format: [red]{e}[/red]") console.print( "💡 Use JSON or a Python dict literal, e.g. " + _EXAMPLE_ADDITIONAL_CONTEXT @@ -194,9 +190,7 @@ def validate_additional_context_structure(context: Dict[str, Any]) -> None: if "log_error_benign_patterns" in context: lebp = context["log_error_benign_patterns"] - if not isinstance(lebp, list) or not all( - isinstance(x, str) for x in lebp - ): + if not isinstance(lebp, list) or not all(isinstance(x, str) for x in lebp): _fail_structure( "log_error_benign_patterns", "an array of strings", @@ -204,8 +198,10 @@ def validate_additional_context_structure(context: Dict[str, Any]) -> None: if "log_error_patterns" in context: lep = context["log_error_patterns"] - if not isinstance(lep, list) or not lep or not all( - isinstance(x, str) for x in lep + if ( + not isinstance(lep, list) + or not lep + or not all(isinstance(x, str) for x in lep) ): _fail_structure( "log_error_patterns", diff --git a/src/madengine/config/__init__.py b/src/madengine/config/__init__.py new file mode 100644 index 00000000..076a66ea --- /dev/null +++ b/src/madengine/config/__init__.py @@ -0,0 +1,25 @@ +"""Config-driven YAML configuration system for madengine.""" + +from madengine.config.loader import HydraConfigLoader +from madengine.config.schema import ConfigValidator +from madengine.config.translator import ConfigTranslator + + +def load_config(config_args: list) -> tuple: + """Load config from Hydra overrides and/or user YAML file. 
class HydraConfigLoader:
    """Loads madengine config using Hydra's Compose API."""

    @staticmethod
    def load(config_args: list) -> "DictConfig":
        """Load and compose config from Hydra overrides and/or user YAML.

        The packaged ``configs`` directory is composed first (with the given
        overrides applied), then the optional user YAML file is merged on
        top. Because the user file is merged last, its values take
        precedence over both the packaged defaults and the overrides.

        Args:
            config_args: Mix of Hydra overrides and optional user YAML path.

        Returns:
            Composed DictConfig with all merges applied.

        Raises:
            ConfigurationError: If more than one YAML file is given, or the
                user YAML does not contain a mapping at its top level.
        """
        user_file, overrides = HydraConfigLoader._parse_args(config_args)

        config_dir = str(Path(importlib.resources.files("madengine")) / "configs")  # type: ignore[attr-defined]

        # Fall back to a path relative to this module when the package
        # resource lookup does not resolve to a real directory.
        if not os.path.isdir(config_dir):
            config_dir = str(Path(__file__).parent.parent / "configs")

        # Drop any Hydra global state left by a previous call so that
        # initialize_config_dir() can be entered again in the same process.
        GlobalHydra.instance().clear()

        with initialize_config_dir(config_dir=config_dir, version_base=None):
            cfg = compose(config_name="config", overrides=overrides)

        if user_file:
            user_cfg = OmegaConf.load(user_file)
            if not isinstance(user_cfg, DictConfig):
                # A list-valued (or otherwise non-mapping) YAML cannot be
                # merged into the composed mapping; report it as a user
                # configuration problem instead of a low-level merge error.
                raise ConfigurationError(
                    f"YAML config file '{user_file}' must contain a mapping "
                    f"at the top level, got {type(user_cfg).__name__}"
                )
            # The composed config is in struct mode; relax it so the user
            # file may introduce keys that the packaged defaults lack.
            OmegaConf.set_struct(cfg, False)
            cfg = OmegaConf.merge(cfg, user_cfg)

        return cfg

    @staticmethod
    def _parse_args(config_args: list) -> tuple:
        """Separate user YAML file path from Hydra overrides.

        An argument counts as the user YAML path when it ends in ``.yaml`` or
        ``.yml``, contains no ``=`` and does not start with ``+``; every
        other argument is passed through as a Hydra override, in order.

        Args:
            config_args: Raw values collected from the ``--config`` option.

        Returns:
            Tuple ``(user_file, overrides)`` where ``user_file`` is the YAML
            path or None and ``overrides`` is a list of override strings.

        Raises:
            ConfigurationError: If more than one YAML file path is present.
        """
        user_file = None
        overrides = []
        for arg in config_args:
            looks_like_yaml_path = (
                arg.endswith((".yaml", ".yml"))
                and "=" not in arg
                and not arg.startswith("+")
            )
            if not looks_like_yaml_path:
                overrides.append(arg)
                continue
            if user_file:
                raise ConfigurationError("Only one YAML config file allowed")
            user_file = arg
        return user_file, overrides
class ConfigTranslator:
    """Maps YAML config keys to internal additional_context dict format."""

    # Explicit renames for nested keys whose internal name is not simply
    # "<section>_<subkey>".
    KEY_MAP = {
        "docker.build_args": "docker_build_arg",
        "docker.env_vars": "docker_env_vars",
        "docker.mounts": "docker_mounts",
        "docker.gpus": "docker_gpus",
        "docker.cpus": "docker_cpus",
        "docker.additional_run_options": "additional_docker_run_options",
        "log_error.pattern_scan": "log_error_pattern_scan",
        "log_error.benign_patterns": "log_error_benign_patterns",
        "log_error.patterns": "log_error_patterns",
    }

    # Top-level keys returned as metadata for the CLI layer instead of being
    # placed in additional_context.
    EXTRACTED_KEYS = {
        "model",
        "build",
        "platform",
        "output",
        "summary_output",
        "data_config",
        "live_output",
    }

    # Nested sections that are flattened into "<section>_<subkey>" entries,
    # mapped to the container type whose *empty* value is dropped.
    _FLATTENED_SECTIONS = {"docker": dict, "log_error": list}

    @classmethod
    def to_additional_context(cls, cfg: "DictConfig") -> tuple:
        """Convert DictConfig to (additional_context, metadata) tuple.

        Args:
            cfg: Composed config. A plain ``dict`` that has already been
                materialised is accepted as well and used as-is.

        Returns:
            additional_context: dict in the format expected by existing pipeline.
            metadata: dict with model.tags, build.registry, etc. for the CLI layer.

        Raises:
            TypeError: If the ``docker`` or ``log_error`` section is present
                but is neither a mapping nor null.
        """
        if isinstance(cfg, dict):
            raw = cfg
        else:
            raw = OmegaConf.to_container(cfg, resolve=True)

        context = {}
        metadata = {}

        for key, value in raw.items():
            if key in cls.EXTRACTED_KEYS or key == "runtime":
                metadata[key] = value
            elif key in cls._FLATTENED_SECTIONS:
                cls._flatten_section(key, value, context)
            elif value is not None:
                context[key] = value

        # `model` may be null or missing in a user-supplied YAML; only a
        # mapping can carry a container image.
        model = metadata.get("model")
        if isinstance(model, dict) and model.get("container_image"):
            context["MAD_CONTAINER_IMAGE"] = model["container_image"]

        return context, metadata

    @classmethod
    def _flatten_section(cls, section: str, value, context: dict) -> None:
        """Copy the entries of one nested section into ``context``.

        Null entries are skipped, as are empty containers of the type
        registered for the section (empty dicts under ``docker``, empty lists
        under ``log_error``), so that unset defaults do not reach the
        additional_context consumers.
        """
        if value is None:
            # e.g. `docker:` with no body in a user YAML: nothing to copy.
            return
        if not isinstance(value, dict):
            raise TypeError(
                f"'{section}' section must be a mapping, "
                f"got {type(value).__name__}"
            )

        empty_type = cls._FLATTENED_SECTIONS[section]
        for subkey, subval in value.items():
            if subval is None:
                continue
            if isinstance(subval, empty_type) and not subval:
                continue
            internal_key = cls.KEY_MAP.get(
                f"{section}.{subkey}", f"{section}_{subkey}"
            )
            context[internal_key] = subval
b/src/madengine/configs/build/default.yaml @@ -0,0 +1,5 @@ +# @package _global_ +build: + registry: null + target_archs: [] + manifest_output: build_manifest.json diff --git a/src/madengine/configs/build/multi_arch.yaml b/src/madengine/configs/build/multi_arch.yaml new file mode 100644 index 00000000..67001f76 --- /dev/null +++ b/src/madengine/configs/build/multi_arch.yaml @@ -0,0 +1,8 @@ +# @package _global_ +build: + registry: null + target_archs: + - gfx942 + - gfx90a + - gfx908 + manifest_output: build_manifest.json diff --git a/src/madengine/configs/config.yaml b/src/madengine/configs/config.yaml new file mode 100644 index 00000000..168647eb --- /dev/null +++ b/src/madengine/configs/config.yaml @@ -0,0 +1,48 @@ +defaults: + - platform: docker + - scheduler: local + - hardware: amd + - launcher: none + - _self_ + +model: + tags: [] + manifest_file: null + container_image: null + skip_run: false + timeout: null + +docker: + build_args: {} + env_vars: {} + mounts: {} + gpus: null + cpus: null + additional_run_options: null + keep_alive: false + clean_cache: false + +build: + registry: null + target_archs: [] + manifest_output: build_manifest.json + +env_vars: {} + +debug: false +live_output: false + +log_error: + pattern_scan: true + benign_patterns: [] + patterns: [] + +tools: [] +pre_scripts: [] +post_scripts: [] +encapsulate_script: null + +data_config: data.json + +output: perf.csv +summary_output: null diff --git a/src/madengine/configs/data/local.yaml b/src/madengine/configs/data/local.yaml new file mode 100644 index 00000000..8cdc5955 --- /dev/null +++ b/src/madengine/configs/data/local.yaml @@ -0,0 +1,4 @@ +# @package _global_ +data: + provider: local + path: null diff --git a/src/madengine/configs/data/minio.yaml b/src/madengine/configs/data/minio.yaml new file mode 100644 index 00000000..3f6ec625 --- /dev/null +++ b/src/madengine/configs/data/minio.yaml @@ -0,0 +1,7 @@ +# @package _global_ +data: + provider: minio + endpoint: null + bucket: null + 
access_key: null + secret_key: null diff --git a/src/madengine/configs/data/nas.yaml b/src/madengine/configs/data/nas.yaml new file mode 100644 index 00000000..e08c48e4 --- /dev/null +++ b/src/madengine/configs/data/nas.yaml @@ -0,0 +1,4 @@ +# @package _global_ +data: + provider: nas + mount_path: null diff --git a/src/madengine/configs/data/s3.yaml b/src/madengine/configs/data/s3.yaml new file mode 100644 index 00000000..a3f8a055 --- /dev/null +++ b/src/madengine/configs/data/s3.yaml @@ -0,0 +1,6 @@ +# @package _global_ +data: + provider: s3 + bucket: null + prefix: null + region: null diff --git a/src/madengine/configs/env/infiniband.yaml b/src/madengine/configs/env/infiniband.yaml new file mode 100644 index 00000000..19f87571 --- /dev/null +++ b/src/madengine/configs/env/infiniband.yaml @@ -0,0 +1,6 @@ +# @package _global_ +env_vars: + NCCL_IB_DISABLE: "0" + NCCL_IB_HCA: "mlx5_0:1,mlx5_1:1" + NCCL_SOCKET_IFNAME: ib0 + NCCL_NET_GDR_LEVEL: "3" diff --git a/src/madengine/configs/env/miopen_defaults.yaml b/src/madengine/configs/env/miopen_defaults.yaml new file mode 100644 index 00000000..05a87b3e --- /dev/null +++ b/src/madengine/configs/env/miopen_defaults.yaml @@ -0,0 +1,4 @@ +# @package _global_ +env_vars: + MIOPEN_FIND_MODE: "1" + MIOPEN_USER_DB_PATH: /tmp/.miopen diff --git a/src/madengine/configs/env/nccl_debug.yaml b/src/madengine/configs/env/nccl_debug.yaml new file mode 100644 index 00000000..5e171ec0 --- /dev/null +++ b/src/madengine/configs/env/nccl_debug.yaml @@ -0,0 +1,5 @@ +# @package _global_ +env_vars: + NCCL_DEBUG: INFO + NCCL_DEBUG_SUBSYS: "INIT,NET,GRAPH" + TORCH_DISTRIBUTED_DEBUG: DETAIL diff --git a/src/madengine/configs/env/nccl_tuned.yaml b/src/madengine/configs/env/nccl_tuned.yaml new file mode 100644 index 00000000..3d434949 --- /dev/null +++ b/src/madengine/configs/env/nccl_tuned.yaml @@ -0,0 +1,7 @@ +# @package _global_ +env_vars: + NCCL_DEBUG: WARN + TORCH_NCCL_HIGH_PRIORITY: "1" + GPU_MAX_HW_QUEUES: "2" + NCCL_TIMEOUT: "600" + 
TORCH_NCCL_ASYNC_ERROR_HANDLING: "1" diff --git a/src/madengine/configs/hardware/amd.yaml b/src/madengine/configs/hardware/amd.yaml new file mode 100644 index 00000000..670f6db1 --- /dev/null +++ b/src/madengine/configs/hardware/amd.yaml @@ -0,0 +1,18 @@ +# @package _global_ +gpu_vendor: AMD +guest_os: UBUNTU + +runtime: + devices: + - /dev/kfd + - /dev/dri + - /dev/infiniband + capabilities: + - SYS_PTRACE + security_opts: + - seccomp=unconfined + network_mode: host + ipc: host + groups: + - video + use_gpu_flag: false diff --git a/src/madengine/configs/hardware/cpu.yaml b/src/madengine/configs/hardware/cpu.yaml new file mode 100644 index 00000000..f08463a6 --- /dev/null +++ b/src/madengine/configs/hardware/cpu.yaml @@ -0,0 +1,12 @@ +# @package _global_ +gpu_vendor: null +guest_os: UBUNTU + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: null + ipc: null + groups: [] + use_gpu_flag: false diff --git a/src/madengine/configs/hardware/nvidia.yaml b/src/madengine/configs/hardware/nvidia.yaml new file mode 100644 index 00000000..471d3467 --- /dev/null +++ b/src/madengine/configs/hardware/nvidia.yaml @@ -0,0 +1,12 @@ +# @package _global_ +gpu_vendor: NVIDIA +guest_os: UBUNTU + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: host + ipc: host + groups: [] + use_gpu_flag: true diff --git a/src/madengine/configs/launcher/deepspeed.yaml b/src/madengine/configs/launcher/deepspeed.yaml new file mode 100644 index 00000000..99df001f --- /dev/null +++ b/src/madengine/configs/launcher/deepspeed.yaml @@ -0,0 +1,8 @@ +# @package _global_ +distributed: + enabled: true + launcher: deepspeed + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 diff --git a/src/madengine/configs/launcher/megatron.yaml b/src/madengine/configs/launcher/megatron.yaml new file mode 100644 index 00000000..0a131248 --- /dev/null +++ b/src/madengine/configs/launcher/megatron.yaml @@ -0,0 +1,8 @@ +# @package _global_ +distributed: + 
enabled: true + launcher: torchrun + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 diff --git a/src/madengine/configs/launcher/native.yaml b/src/madengine/configs/launcher/native.yaml new file mode 100644 index 00000000..248e1b77 --- /dev/null +++ b/src/madengine/configs/launcher/native.yaml @@ -0,0 +1,8 @@ +# @package _global_ +distributed: + enabled: true + launcher: native + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 diff --git a/src/madengine/configs/launcher/none.yaml b/src/madengine/configs/launcher/none.yaml new file mode 100644 index 00000000..f7e60ebe --- /dev/null +++ b/src/madengine/configs/launcher/none.yaml @@ -0,0 +1,3 @@ +# @package _global_ +distributed: + enabled: false diff --git a/src/madengine/configs/launcher/primus.yaml b/src/madengine/configs/launcher/primus.yaml new file mode 100644 index 00000000..ed548efe --- /dev/null +++ b/src/madengine/configs/launcher/primus.yaml @@ -0,0 +1,8 @@ +# @package _global_ +distributed: + enabled: true + launcher: primus + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 diff --git a/src/madengine/configs/launcher/sglang.yaml b/src/madengine/configs/launcher/sglang.yaml new file mode 100644 index 00000000..80509f1f --- /dev/null +++ b/src/madengine/configs/launcher/sglang.yaml @@ -0,0 +1,8 @@ +# @package _global_ +distributed: + enabled: true + launcher: sglang + backend: nccl + nnodes: 1 + nproc_per_node: 8 + port: 29500 diff --git a/src/madengine/configs/launcher/sglang_disagg.yaml b/src/madengine/configs/launcher/sglang_disagg.yaml new file mode 100644 index 00000000..77fb212c --- /dev/null +++ b/src/madengine/configs/launcher/sglang_disagg.yaml @@ -0,0 +1,13 @@ +# @package _global_ +distributed: + enabled: true + launcher: sglang-disagg + backend: nccl + nnodes: 3 + nproc_per_node: 8 + port: 29500 + +sglang_disagg: + prefill_nodes: null + decode_nodes: null + transfer_backend: mooncake diff --git a/src/madengine/configs/launcher/torchrun.yaml 
b/src/madengine/configs/launcher/torchrun.yaml new file mode 100644 index 00000000..4e7798f2 --- /dev/null +++ b/src/madengine/configs/launcher/torchrun.yaml @@ -0,0 +1,9 @@ +# @package _global_ +distributed: + enabled: true + launcher: torchrun + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 + port: 29500 diff --git a/src/madengine/configs/launcher/torchtitan.yaml b/src/madengine/configs/launcher/torchtitan.yaml new file mode 100644 index 00000000..0a131248 --- /dev/null +++ b/src/madengine/configs/launcher/torchtitan.yaml @@ -0,0 +1,8 @@ +# @package _global_ +distributed: + enabled: true + launcher: torchrun + backend: nccl + nnodes: 1 + nproc_per_node: 8 + master_port: 29500 diff --git a/src/madengine/configs/launcher/vllm.yaml b/src/madengine/configs/launcher/vllm.yaml new file mode 100644 index 00000000..cb38b23a --- /dev/null +++ b/src/madengine/configs/launcher/vllm.yaml @@ -0,0 +1,11 @@ +# @package _global_ +distributed: + enabled: true + launcher: vllm + nnodes: 1 + nproc_per_node: 4 + +vllm: + kv_cache_size: 0.7 + max_model_len: null + tensor_parallel_size: null diff --git a/src/madengine/configs/platform/bare_metal.yaml b/src/madengine/configs/platform/bare_metal.yaml new file mode 100644 index 00000000..09825ab0 --- /dev/null +++ b/src/madengine/configs/platform/bare_metal.yaml @@ -0,0 +1,3 @@ +# @package _global_ +platform: + type: bare_metal diff --git a/src/madengine/configs/platform/docker.yaml b/src/madengine/configs/platform/docker.yaml new file mode 100644 index 00000000..cdc555ec --- /dev/null +++ b/src/madengine/configs/platform/docker.yaml @@ -0,0 +1,3 @@ +# @package _global_ +platform: + type: docker diff --git a/src/madengine/configs/platform/podman.yaml b/src/madengine/configs/platform/podman.yaml new file mode 100644 index 00000000..a1b85147 --- /dev/null +++ b/src/madengine/configs/platform/podman.yaml @@ -0,0 +1,3 @@ +# @package _global_ +platform: + type: podman diff --git 
a/src/madengine/configs/platform/singularity.yaml b/src/madengine/configs/platform/singularity.yaml new file mode 100644 index 00000000..0cca82a9 --- /dev/null +++ b/src/madengine/configs/platform/singularity.yaml @@ -0,0 +1,3 @@ +# @package _global_ +platform: + type: singularity diff --git a/src/madengine/configs/profile/a100_8gpu.yaml b/src/madengine/configs/profile/a100_8gpu.yaml new file mode 100644 index 00000000..ca5f58a5 --- /dev/null +++ b/src/madengine/configs/profile/a100_8gpu.yaml @@ -0,0 +1,18 @@ +# @package _global_ +gpu_vendor: NVIDIA +guest_os: UBUNTU +gpu_type: a100 +gpu_memory_gb: 80 +gpus_per_node: 8 + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: host + ipc: host + groups: [] + use_gpu_flag: true + +distributed: + nproc_per_node: 8 diff --git a/src/madengine/configs/profile/h100_8gpu.yaml b/src/madengine/configs/profile/h100_8gpu.yaml new file mode 100644 index 00000000..95095f81 --- /dev/null +++ b/src/madengine/configs/profile/h100_8gpu.yaml @@ -0,0 +1,18 @@ +# @package _global_ +gpu_vendor: NVIDIA +guest_os: UBUNTU +gpu_type: h100 +gpu_memory_gb: 80 +gpus_per_node: 8 + +runtime: + devices: [] + capabilities: [] + security_opts: [] + network_mode: host + ipc: host + groups: [] + use_gpu_flag: true + +distributed: + nproc_per_node: 8 diff --git a/src/madengine/configs/profile/mi250x_4gpu.yaml b/src/madengine/configs/profile/mi250x_4gpu.yaml new file mode 100644 index 00000000..67a580ae --- /dev/null +++ b/src/madengine/configs/profile/mi250x_4gpu.yaml @@ -0,0 +1,11 @@ +# @package _global_ +gpu_type: mi250x +gpu_memory_gb: 128 +gpus_per_node: 4 + +distributed: + nproc_per_node: 4 + +env_vars: + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" diff --git a/src/madengine/configs/profile/mi300x_8gpu.yaml b/src/madengine/configs/profile/mi300x_8gpu.yaml new file mode 100644 index 00000000..52af4476 --- /dev/null +++ b/src/madengine/configs/profile/mi300x_8gpu.yaml @@ -0,0 +1,12 @@ +# @package _global_ +gpu_type: mi300x 
+gpu_memory_gb: 192 +gpus_per_node: 8 + +distributed: + nproc_per_node: 8 + +env_vars: + GPU_MAX_HW_QUEUES: "2" + HSA_ENABLE_SDMA: "0" + HSA_FORCE_FINE_GRAIN_PCIE: "1" diff --git a/src/madengine/configs/profile/mi300x_single.yaml b/src/madengine/configs/profile/mi300x_single.yaml new file mode 100644 index 00000000..ee693ac5 --- /dev/null +++ b/src/madengine/configs/profile/mi300x_single.yaml @@ -0,0 +1,7 @@ +# @package _global_ +gpu_type: mi300x +gpu_memory_gb: 192 +gpus_per_node: 1 + +distributed: + nproc_per_node: 1 diff --git a/src/madengine/configs/scheduler/k8s.yaml b/src/madengine/configs/scheduler/k8s.yaml new file mode 100644 index 00000000..6e946ab2 --- /dev/null +++ b/src/madengine/configs/scheduler/k8s.yaml @@ -0,0 +1,31 @@ +# @package _global_ +k8s: + kubeconfig: ~/.kube/config + namespace: default + image_pull_policy: Always + backoff_limit: 3 + ttl_seconds_after_finished: null + allow_privileged_profiling: null + gpu_count: null + gpu_resource_name: amd.com/gpu + memory: null + memory_limit: null + cpu: null + cpu_limit: null + host_ipc: true + node_selector: {} + tolerations: [] + nfs_storage_class: nfs-banff + local_path_storage_class: local-path + data_storage_class: nfs-banff + recreate_shared_data_pvc: false + results_pvc: null + data_pvc: null + output_dir: null + secrets: + strategy: from_local_credentials + image_pull_secret_names: [] + runtime_secret_name: null + +env_vars: + OMP_NUM_THREADS: "8" diff --git a/src/madengine/configs/scheduler/local.yaml b/src/madengine/configs/scheduler/local.yaml new file mode 100644 index 00000000..03bfe3db --- /dev/null +++ b/src/madengine/configs/scheduler/local.yaml @@ -0,0 +1 @@ +# @package _global_ diff --git a/src/madengine/configs/scheduler/slurm.yaml b/src/madengine/configs/scheduler/slurm.yaml new file mode 100644 index 00000000..ad0d6494 --- /dev/null +++ b/src/madengine/configs/scheduler/slurm.yaml @@ -0,0 +1,21 @@ +# @package _global_ +slurm: + partition: amd-rccl + nodes: 1 + gpus_per_node: 8 + 
time: "24:00:00" + output_dir: ./slurm_results + exclusive: true + modules: [] + account: null + qos: null + constraint: null + nodelist: null + exclude: null + results_dir: null + shared_workspace: null + network_interface: null + +env_vars: + OMP_NUM_THREADS: "8" + MIOPEN_FIND_MODE: "1" diff --git a/src/madengine/configs/tools/power_profiler.yaml b/src/madengine/configs/tools/power_profiler.yaml new file mode 100644 index 00000000..61a5d29c --- /dev/null +++ b/src/madengine/configs/tools/power_profiler.yaml @@ -0,0 +1,9 @@ +# @package _global_ +tools: + - name: gpu_info_power_profiler + env_vars: + POWER_DEVICE: all + POWER_SAMPLING_RATE: "0.1" + POWER_MODE: power + POWER_DUAL_GCD: "false" + POWER_OUTPUT_FILE: gpu_info_power_profiler_output.csv diff --git a/src/madengine/configs/tools/rocm_trace_lite.yaml b/src/madengine/configs/tools/rocm_trace_lite.yaml new file mode 100644 index 00000000..7142ed28 --- /dev/null +++ b/src/madengine/configs/tools/rocm_trace_lite.yaml @@ -0,0 +1,5 @@ +# @package _global_ +tools: + - name: rocm_trace_lite + env_vars: + RTL_MODE: lite diff --git a/src/madengine/configs/tools/rocprofv3_comprehensive.yaml b/src/madengine/configs/tools/rocprofv3_comprehensive.yaml new file mode 100644 index 00000000..001cc4c8 --- /dev/null +++ b/src/madengine/configs/tools/rocprofv3_comprehensive.yaml @@ -0,0 +1,17 @@ +# @package _global_ +tools: + - name: rocprofv3_full + env_vars: + RCCL_DEBUG: INFO + HSA_ENABLE_SDMA: "0" + - name: gpu_info_power_profiler + env_vars: + POWER_DEVICE: all + POWER_SAMPLING_RATE: "0.1" + POWER_DUAL_GCD: "false" + - name: gpu_info_vram_profiler + env_vars: + VRAM_DEVICE: all + VRAM_SAMPLING_RATE: "0.1" + - name: miopen_trace + - name: rocblas_trace diff --git a/src/madengine/configs/tools/rocprofv3_lightweight.yaml b/src/madengine/configs/tools/rocprofv3_lightweight.yaml new file mode 100644 index 00000000..7064316f --- /dev/null +++ b/src/madengine/configs/tools/rocprofv3_lightweight.yaml @@ -0,0 +1,3 @@ +# @package 
_global_ +tools: + - name: rocprofv3_lightweight diff --git a/src/madengine/configs/tools/vram_profiler.yaml b/src/madengine/configs/tools/vram_profiler.yaml new file mode 100644 index 00000000..c53c6f70 --- /dev/null +++ b/src/madengine/configs/tools/vram_profiler.yaml @@ -0,0 +1,9 @@ +# @package _global_ +tools: + - name: gpu_info_vram_profiler + env_vars: + VRAM_DEVICE: all + VRAM_SAMPLING_RATE: "0.1" + VRAM_MODE: vram + VRAM_DUAL_GCD: "false" + VRAM_OUTPUT_FILE: gpu_info_vram_profiler_output.csv diff --git a/src/madengine/core/auth.py b/src/madengine/core/auth.py index 15f0a0a6..48e1cd0a 100644 --- a/src/madengine/core/auth.py +++ b/src/madengine/core/auth.py @@ -13,11 +13,7 @@ import shlex from typing import Dict, Optional -from madengine.core.errors import ( - ConfigurationError, - create_error_context, - handle_error, -) +from madengine.core.errors import ConfigurationError, create_error_context, handle_error def load_credentials() -> Optional[Dict]: @@ -40,7 +36,10 @@ def load_credentials() -> Optional[Dict]: with open(credential_file) as f: loaded = json.load(f) if not isinstance(loaded, dict): - raise ValueError("credential.json must contain a JSON object, not " + type(loaded).__name__) + raise ValueError( + "credential.json must contain a JSON object, not " + + type(loaded).__name__ + ) credentials = loaded print( f"Loaded credentials from {credential_file}: " @@ -163,7 +162,7 @@ def login_to_registry( # Pass the password via an environment variable so it never appears in # the process argument list (visible via /proc or ps to other users). 
quoted_username = shlex.quote(username) - login_command = "printf %s \"$MAD_REGISTRY_PASSWORD\" | docker login" + login_command = 'printf %s "$MAD_REGISTRY_PASSWORD" | docker login' if registry and registry.lower() not in ["docker.io", "dockerhub"]: login_command += f" {shlex.quote(str(registry))}" login_command += f" --username {quoted_username} --password-stdin" @@ -177,8 +176,6 @@ def login_to_registry( f"{registry or 'DockerHub'}[/green]" ) except Exception as e: - rich_console.print( - f"[red]Failed to login to registry {registry}: {e}[/red]" - ) + rich_console.print(f"[red]Failed to login to registry {registry}: {e}[/red]") if raise_on_failure: raise diff --git a/src/madengine/core/console.py b/src/madengine/core/console.py index 57d7b329..d89488b0 100644 --- a/src/madengine/core/console.py +++ b/src/madengine/core/console.py @@ -5,10 +5,11 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ +import re + # built-in modules import subprocess import typing -import re class Console: diff --git a/src/madengine/core/constants.py b/src/madengine/core/constants.py index d1afa4c9..f4cf38cb 100644 --- a/src/madengine/core/constants.py +++ b/src/madengine/core/constants.py @@ -22,11 +22,12 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ -# built-in modules -import os import json import logging +# built-in modules +import os + # Utility function for optional verbose logging of configuration def _log_config_info(message: str, force_print: bool = False): @@ -46,14 +47,14 @@ def _log_config_info(message: str, force_print: bool = False): def _setup_model_dir(): """Setup model directory if MODEL_DIR environment variable is set. - + MODEL_DIR defaults to "." (current directory) if not set. Only copies if MODEL_DIR points to a different directory than current working directory. 
""" # Get absolute paths to compare model_dir_abs = os.path.abspath(MODEL_DIR) cwd_abs = os.path.abspath(".") - + # Only copy if MODEL_DIR points to a different directory (not current dir) if model_dir_abs != cwd_abs: # Copy MODEL_DIR to the current working directory. diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index fb934483..dc598291 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -23,7 +23,11 @@ # third-party modules from madengine.core.console import Console from madengine.utils.rocm_path_resolver import resolve_host_rocm_path -from madengine.utils.gpu_validator import validate_rocm_installation, GPUInstallationError, GPUVendor +from madengine.utils.gpu_validator import ( + validate_rocm_installation, + GPUInstallationError, + GPUVendor, +) from madengine.utils.gpu_tool_factory import get_gpu_tool_manager from madengine.utils.gpu_tool_manager import BaseGPUToolManager @@ -180,10 +184,14 @@ def init_build_context(self, detect_gpu_arch: bool = False) -> None: # Optionally auto-detect GPU architecture for local full-workflow builds (build+run). # Skipped for standalone `madengine build` on non-GPU/CI nodes (detect_gpu_arch=False). 
- if detect_gpu_arch and "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.ctx.get("docker_build_arg", {}): + if detect_gpu_arch and "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.ctx.get( + "docker_build_arg", {} + ): try: from madengine.utils.gpu_validator import detect_gpu_vendor - from madengine.execution.dockerfile_utils import normalize_architecture_name + from madengine.execution.dockerfile_utils import ( + normalize_architecture_name, + ) vendor = detect_gpu_vendor(self._rocm_path) if vendor in (GPUVendor.AMD, GPUVendor.NVIDIA): @@ -193,11 +201,17 @@ def init_build_context(self, detect_gpu_arch: bool = False) -> None: self.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = arch print(f"Auto-detected GPU architecture for build: {arch}") else: - print("Warning: No supported GPU detected; MAD_SYSTEM_GPU_ARCHITECTURE will not be set automatically.") - print("Consider providing it via --additional-context if needed for build args.") + print( + "Warning: No supported GPU detected; MAD_SYSTEM_GPU_ARCHITECTURE will not be set automatically." + ) + print( + "Consider providing it via --additional-context if needed for build args." + ) except Exception as e: print(f"Warning: Could not auto-detect GPU architecture for build: {e}") - print("Consider providing MAD_SYSTEM_GPU_ARCHITECTURE via --additional-context if needed for build args.") + print( + "Consider providing MAD_SYSTEM_GPU_ARCHITECTURE via --additional-context if needed for build args." + ) # Don't initialize NUMA balancing check for build-only nodes # This is runtime-specific and should be handled on execution nodes @@ -352,10 +366,10 @@ def ensure_system_context(self) -> None: def _get_tool_manager(self) -> BaseGPUToolManager: """Get GPU tool manager for the current vendor (lazy initialization). 
- + Returns: GPU tool manager instance - + Raises: ValueError: If GPU vendor cannot be determined or is unsupported """ @@ -371,9 +385,11 @@ def _get_tool_manager(self) -> BaseGPUToolManager: vendor = None # Auto-detect else: vendor = None # Auto-detect - - self._gpu_tool_manager = get_gpu_tool_manager(vendor, rocm_path=self._rocm_path) - + + self._gpu_tool_manager = get_gpu_tool_manager( + vendor, rocm_path=self._rocm_path + ) + return self._gpu_tool_manager def get_ctx_test(self) -> str: @@ -403,19 +419,22 @@ def get_gpu_vendor(self) -> str: What types of GPU vendors are supported? - NVIDIA - AMD - + PR #54 Enhancement: Added fallback to rocm-smi if amd-smi is missing. """ # Check NVIDIA first (simplest check) if os.path.exists("/usr/bin/nvidia-smi"): try: - result = self.console.sh("/usr/bin/nvidia-smi > /dev/null 2>&1 && echo 'NVIDIA' || echo ''", timeout=180) + result = self.console.sh( + "/usr/bin/nvidia-smi > /dev/null 2>&1 && echo 'NVIDIA' || echo ''", + timeout=180, + ) if result and result.strip() == "NVIDIA": return "NVIDIA" except Exception as e: print(f"Warning: nvidia-smi check failed: {e}") - + # Check AMD - try amd-smi first, fallback to rocm-smi (PR #54) # Use configurable ROCm path (MAD_ROCM_PATH / ROCM_PATH) for non-default installs amd_smi_paths = [ @@ -426,22 +445,28 @@ def get_gpu_vendor(self) -> str: if os.path.exists(amd_smi_path): try: # Verify amd-smi actually works (180s timeout for slow GPU initialization) - result = self.console.sh(f"{amd_smi_path} list > /dev/null 2>&1 && echo 'AMD' || echo ''", timeout=180) + result = self.console.sh( + f"{amd_smi_path} list > /dev/null 2>&1 && echo 'AMD' || echo ''", + timeout=180, + ) if result and result.strip() == "AMD": return "AMD" except Exception as e: print(f"Warning: amd-smi check failed for {amd_smi_path}: {e}") - + # Fallback to rocm-smi (PR #54) rocm_smi_path = os.path.join(self._rocm_path, "bin", "rocm-smi") if os.path.exists(rocm_smi_path): try: - result = 
self.console.sh(f"{rocm_smi_path} --showid > /dev/null 2>&1 && echo 'AMD' || echo ''", timeout=180) + result = self.console.sh( + f"{rocm_smi_path} --showid > /dev/null 2>&1 && echo 'AMD' || echo ''", + timeout=180, + ) if result and result.strip() == "AMD": return "AMD" except Exception as e: print(f"Warning: rocm-smi check failed: {e}") - + return "Unable to detect GPU vendor" def get_host_os(self) -> str: @@ -500,20 +525,19 @@ def get_system_ngpus(self) -> int: What types of GPU vendors are supported? - NVIDIA - AMD - + Enhancement: Uses version-aware tool manager with automatic fallback (PR #54). """ vendor = self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] - + if vendor == "AMD": try: tool_manager = self._get_tool_manager() return tool_manager.get_gpu_count() except Exception as e: raise RuntimeError( - f"Unable to determine number of AMD GPUs. " - f"Error: {e}" + f"Unable to determine number of AMD GPUs. " f"Error: {e}" ) elif vendor == "NVIDIA": try: @@ -522,12 +546,13 @@ def get_system_ngpus(self) -> int: except Exception as e: # Fallback to direct command for NVIDIA (longer timeout for slow compute nodes) try: - number_gpus = int(self.console.sh("nvidia-smi -L | wc -l", timeout=180)) + number_gpus = int( + self.console.sh("nvidia-smi -L | wc -l", timeout=180) + ) return number_gpus except Exception: raise RuntimeError( - f"Unable to determine number of NVIDIA GPUs. " - f"Error: {e}" + f"Unable to determine number of NVIDIA GPUs. " f"Error: {e}" ) else: raise RuntimeError(f"Unable to determine gpu vendor: {vendor}") @@ -569,32 +594,31 @@ def get_system_gpu_architecture(self) -> str: def get_system_gpu_product_name(self) -> str: """Get system GPU product name with fallback (PR #54). - + Returns: str: The GPU product name (e.g., AMD Instinct MI300X, NVIDIA H100 80GB HBM3). - + Raises: RuntimeError: If the GPU vendor is not detected. RuntimeError: If the GPU product name is unable to determine. - + Note: What types of GPU vendors are supported? 
- NVIDIA - AMD - + PR #54 Enhancement: Added rocm-smi fallback for AMD GPUs when amd-smi unavailable. """ vendor = self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] - + if vendor == "AMD": try: tool_manager = self._get_tool_manager() return tool_manager.get_gpu_product_name(gpu_id=0) except Exception as e: raise RuntimeError( - f"Unable to determine AMD GPU product name. " - f"Error: {e}" + f"Unable to determine AMD GPU product name. " f"Error: {e}" ) elif vendor == "NVIDIA": try: @@ -603,58 +627,66 @@ def get_system_gpu_product_name(self) -> str: except Exception as e: # Fallback to direct command for NVIDIA (longer timeout for slow compute nodes) try: - return self.console.sh("nvidia-smi --query-gpu=name --format=csv,noheader,nounits -i 0", timeout=180) + return self.console.sh( + "nvidia-smi --query-gpu=name --format=csv,noheader,nounits -i 0", + timeout=180, + ) except Exception: raise RuntimeError( - f"Unable to determine NVIDIA GPU product name. " - f"Error: {e}" + f"Unable to determine NVIDIA GPU product name. " f"Error: {e}" ) else: - raise RuntimeError(f"Unable to determine gpu product name for vendor: {vendor}") + raise RuntimeError( + f"Unable to determine gpu product name for vendor: {vendor}" + ) def get_system_hip_version(self): """Get HIP/CUDA version using tool manager. - + Returns: str: Version string (e.g., "6.4" for ROCm, "12.0" for CUDA) - + Raises: RuntimeError: If version cannot be determined - + Enhancement: Uses tool manager for robust version detection with multiple fallbacks. 
""" - vendor = self.ctx['docker_env_vars']['MAD_GPU_VENDOR'] - - if vendor == 'AMD': + vendor = self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] + + if vendor == "AMD": try: tool_manager = self._get_tool_manager() version_str = tool_manager.get_version() if version_str: # Return major.minor only (e.g., "6.4.1" -> "6.4") - parts = version_str.split('.') + parts = version_str.split(".") if len(parts) >= 2: return f"{parts[0]}.{parts[1]}" return version_str - + # Fallback to hipconfig if tool manager fails version = self.console.sh("hipconfig --version | cut -d'.' -f1,2") if not version or version.strip() == "": raise RuntimeError("hipconfig returned empty version") return version - + except Exception as e: raise RuntimeError( f"Unable to determine HIP version. " f"Ensure ROCm is installed and hipconfig is accessible. " f"Error: {e}" ) - elif vendor == 'NVIDIA': + elif vendor == "NVIDIA": try: tool_manager = self._get_tool_manager() - return tool_manager.get_version() or self.console.sh("nvcc --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'") + return tool_manager.get_version() or self.console.sh( + "nvcc --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'" + ) except Exception: - return self.console.sh("nvcc --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'") + return self.console.sh( + "nvcc --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'" + ) else: raise RuntimeError(f"Unable to determine hip version for vendor: {vendor}") @@ -692,11 +724,11 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: """ # Initialize the GPU renderD nodes. gpu_renderDs = None - + # Check if the GPU vendor is AMD. 
- if self.ctx['docker_env_vars']['MAD_GPU_VENDOR'] != 'AMD': + if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] != "AMD": return gpu_renderDs - + try: # Get ROCm version using tool manager for robust detection (PR #54) try: @@ -707,45 +739,62 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: except Exception as e: # Fallback to direct file read version_file = os.path.join(self._rocm_path, ".info", "version") - rocm_version_str = self.console.sh(f"cat {version_file} | cut -d'-' -f1") + rocm_version_str = self.console.sh( + f"cat {version_file} | cut -d'-' -f1" + ) if not rocm_version_str or rocm_version_str.strip() == "": - raise RuntimeError(f"Failed to retrieve ROCm version from {version_file}") - + raise RuntimeError( + f"Failed to retrieve ROCm version from {version_file}" + ) + # Parse version safely try: rocm_version = tuple(map(int, rocm_version_str.strip().split("."))) except (ValueError, AttributeError) as parse_err: - raise RuntimeError(f"Failed to parse ROCm version '{rocm_version_str}': {parse_err}") - + raise RuntimeError( + f"Failed to parse ROCm version '{rocm_version_str}': {parse_err}" + ) + # Get renderDs from KFD properties # Try KFD topology first (preferred), but gracefully handle permission errors # On HPC/multi-user systems, KFD topology files may be restricted kfd_renderDs = None kfd_properties = [] try: - kfd_output = self.console.sh("grep -r drm_render_minor /sys/devices/virtual/kfd/kfd/topology/nodes") + kfd_output = self.console.sh( + "grep -r drm_render_minor /sys/devices/virtual/kfd/kfd/topology/nodes" + ) if kfd_output and kfd_output.strip(): kfd_properties = kfd_output.split("\n") # Filter out empty lines and CPU entries (renderD value 0) kfd_properties = [ - line for line in kfd_properties + line + for line in kfd_properties if line.strip() and line.split() and int(line.split()[-1]) != 0 ] if kfd_properties: - kfd_renderDs = [int(line.split()[-1]) for line in kfd_properties] + kfd_renderDs = [ + 
int(line.split()[-1]) for line in kfd_properties + ] except Exception as kfd_error: # KFD topology read failed (common on HPC clusters with restricted permissions) # Will use amd-smi/rocm-smi fallback which provides renderD info directly - print(f"Note: KFD topology not accessible ({kfd_error}), using ROCm tools fallback") + print( + f"Note: KFD topology not accessible ({kfd_error}), using ROCm tools fallback" + ) # Get gpu id - renderD mapping using unique id if ROCm < 6.4.1 and node id otherwise # node id is more robust but is only available from 6.4.1 (PR #54) if rocm_version < (6, 4, 1): # Legacy method using unique_id - kfd_unique_output = self.console.sh("grep -r unique_id /sys/devices/virtual/kfd/kfd/topology/nodes") + kfd_unique_output = self.console.sh( + "grep -r unique_id /sys/devices/virtual/kfd/kfd/topology/nodes" + ) if not kfd_unique_output: - raise RuntimeError("Failed to retrieve unique_id from KFD properties") - + raise RuntimeError( + "Failed to retrieve unique_id from KFD properties" + ) + kfd_unique_ids_raw = kfd_unique_output.split("\n") # Convert unique_ids to hex, filtering empty lines kfd_unique_ids = [] @@ -755,7 +804,9 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: unique_id_int = int(item.split()[-1]) kfd_unique_ids.append(hex(unique_id_int)) except (ValueError, IndexError) as e: - print(f"Warning: Failed to parse unique_id from line '{item}': {e}") + print( + f"Warning: Failed to parse unique_id from line '{item}': {e}" + ) continue if len(kfd_unique_ids) != len(kfd_renderDs): @@ -766,54 +817,67 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: # Map unique ids to renderDs uniqueid_renderD_map = { - unique_id: renderD + unique_id: renderD for unique_id, renderD in zip(kfd_unique_ids, kfd_renderDs) } # Get GPU ID to unique ID mapping from rocm-smi (longer timeout for slow compute nodes) - rsmi_output = self.console.sh("rocm-smi --showuniqueid | grep 'Unique.*:'", timeout=180) + rsmi_output 
= self.console.sh( + "rocm-smi --showuniqueid | grep 'Unique.*:'", timeout=180 + ) if not rsmi_output or rsmi_output.strip() == "": raise RuntimeError("Failed to retrieve unique IDs from rocm-smi") - - rsmi_lines = [line.strip() for line in rsmi_output.split("\n") if line.strip()] - + + rsmi_lines = [ + line.strip() for line in rsmi_output.split("\n") if line.strip() + ] + # Sort gpu_renderDs based on GPU IDs gpu_renderDs = [] for line in rsmi_lines: try: unique_id = line.split()[-1] if unique_id not in uniqueid_renderD_map: - raise KeyError(f"Unique ID '{unique_id}' from rocm-smi not found in KFD mapping") + raise KeyError( + f"Unique ID '{unique_id}' from rocm-smi not found in KFD mapping" + ) gpu_renderDs.append(uniqueid_renderD_map[unique_id]) except (IndexError, KeyError) as e: - raise RuntimeError(f"Failed to map unique ID from line '{line}': {e}") + raise RuntimeError( + f"Failed to map unique ID from line '{line}': {e}" + ) else: # Modern method using amd-smi (ROCm >= 6.4.0) # Get list of GPUs from amd-smi (redirect stderr to filter warnings) # Longer timeout (180s) for slow GPU initialization on SLURM compute nodes - output = self.console.sh("amd-smi list -e --json 2>/dev/null || amd-smi list -e --json 2>&1", timeout=180) + output = self.console.sh( + "amd-smi list -e --json 2>/dev/null || amd-smi list -e --json 2>&1", + timeout=180, + ) if not output or output.strip() == "": raise ValueError("Failed to retrieve AMD GPU data from amd-smi") - + # amd-smi may output warnings before JSON - extract only JSON part # Look for lines starting with '[' or '{' (JSON start) json_start = -1 - lines = output.split('\n') + lines = output.split("\n") for i, line in enumerate(lines): - if line.strip().startswith('[') or line.strip().startswith('{'): + if line.strip().startswith("[") or line.strip().startswith("{"): json_start = i break - + if json_start >= 0: - json_output = '\n'.join(lines[json_start:]) + json_output = "\n".join(lines[json_start:]) else: json_output = 
output - + try: data = json.loads(json_output) except json.JSONDecodeError as e: - raise ValueError(f"Failed to parse amd-smi JSON output: {e}. Output was: {output[:200]}") - + raise ValueError( + f"Failed to parse amd-smi JSON output: {e}. Output was: {output[:200]}" + ) + if not data or not isinstance(data, list): raise ValueError("amd-smi returned empty or invalid data") @@ -827,9 +891,13 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: if match: kfd_nodeids.append(int(match.group())) else: - print(f"Warning: Could not extract node ID from line: {line}") + print( + f"Warning: Could not extract node ID from line: {line}" + ) except (IndexError, ValueError) as e: - print(f"Warning: Failed to parse node ID from line '{line}': {e}") + print( + f"Warning: Failed to parse node ID from line '{line}': {e}" + ) continue if len(kfd_nodeids) != len(kfd_renderDs): @@ -840,7 +908,7 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: # Map node ids to renderDs nodeid_renderD_map = { - nodeid: renderD + nodeid: renderD for nodeid, renderD in zip(kfd_nodeids, kfd_renderDs) } @@ -850,12 +918,14 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: try: gpuid_nodeid_map[item["gpu"]] = item["node_id"] except KeyError as e: - raise KeyError(f"Failed to parse node_id from amd-smi data: {e}. Item: {item}") + raise KeyError( + f"Failed to parse node_id from amd-smi data: {e}. Item: {item}" + ) # Sort gpu_renderDs based on gpu ids try: gpu_renderDs = [ - nodeid_renderD_map[gpuid_nodeid_map[gpuid]] + nodeid_renderD_map[gpuid_nodeid_map[gpuid]] for gpuid in sorted(gpuid_nodeid_map.keys()) ] except KeyError as e: @@ -871,7 +941,9 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: render_num = int(render_str.replace("renderD", "")) gpu_renderDs.append(render_num) except (KeyError, ValueError) as e: - raise RuntimeError(f"Failed to parse renderD from amd-smi: {e}. 
Item: {item}") + raise RuntimeError( + f"Failed to parse renderD from amd-smi: {e}. Item: {item}" + ) except (RuntimeError, ValueError, KeyError) as e: # Re-raise with context diff --git a/src/madengine/core/dataprovider.py b/src/madengine/core/dataprovider.py index 809c4425..62914683 100644 --- a/src/madengine/core/dataprovider.py +++ b/src/madengine/core/dataprovider.py @@ -26,9 +26,9 @@ # madengine modules from madengine.core.console import Console +from madengine.core.constants import MAD_AWS_S3, MAD_MINIO, NAS_NODES from madengine.core.context import Context from madengine.core.docker import Docker -from madengine.core.constants import NAS_NODES, MAD_AWS_S3, MAD_MINIO class DataSourceException(Exception): @@ -313,11 +313,11 @@ def prepare_data(self, model_docker): if "mirrorlocal" in self.config: # copy data from NAS locally cmd = """ - if [ -f \"$(which apt)\" ]; then + if [ -f \"$(which apt)\" ]; then apt update && apt install -y sshpass sshfs rsync - elif [ -f \"$(which yum)\" ]; then + elif [ -f \"$(which yum)\" ]; then yum install -y sshpass rsync - else + else echo 'Unable to detect Host OS' exit 1 fi @@ -350,11 +350,11 @@ def prepare_data(self, model_docker): print("Data Download Duration: {} seconds".format(self.duration)) else: cmd = """ - if [ -f \"$(which apt)\" ]; then + if [ -f \"$(which apt)\" ]; then apt update && apt install -y sshpass sshfs - elif [ -f \"$(which yum)\" ]; then + elif [ -f \"$(which yum)\" ]; then yum install -y sshpass sshfs - else + else echo 'Unable to detect Host OS' exit 1 fi diff --git a/src/madengine/core/docker.py b/src/madengine/core/docker.py index 115b9448..c15d5eff 100644 --- a/src/madengine/core/docker.py +++ b/src/madengine/core/docker.py @@ -68,17 +68,13 @@ def __init__( ) # if container name exists, clean it up automatically if container_name_exists: - print( - f"⚠️ Container '{container_name}' already exists. Cleaning up..." - ) + print(f"⚠️ Container '{container_name}' already exists. 
Cleaning up...") # Stop the container (with timeout) self.console.sh( f"docker stop -t 1 {container_name_quoted} 2>/dev/null || true" ) # Remove the container - self.console.sh( - f"docker rm -f {container_name_quoted} 2>/dev/null || true" - ) + self.console.sh(f"docker rm -f {container_name_quoted} 2>/dev/null || true") print(f"✓ Cleaned up existing container '{container_name}'") # run docker command @@ -107,7 +103,7 @@ def __init__( command += "--workdir /myworkspace/ " command += "--name " + container_name + " " command += image + " " - + # Use 'cat' to keep container alive (blocks waiting for stdin) # Works reliably across all deployment types (local, k8s, slurm) # with fresh image pulls preventing corrupted layer issues @@ -116,9 +112,7 @@ def __init__( # find container sha — use the same exact-match filter as the existence # check above to avoid false positives from substring/regex matches. - self.docker_sha = self.console.sh( - f"docker ps -aqf name={container_name_regex}" - ) + self.docker_sha = self.console.sh(f"docker ps -aqf name={container_name_regex}") def sh(self, command: str, timeout: int = 60, secret: bool = False) -> str: """Run shell command inside docker. diff --git a/src/madengine/core/errors.py b/src/madengine/core/errors.py index 6a0757ab..168a0306 100644 --- a/src/madengine/core/errors.py +++ b/src/madengine/core/errors.py @@ -8,15 +8,17 @@ import logging from dataclasses import dataclass -from typing import Optional, Any, Dict, List from enum import Enum +from typing import Any, Dict, List, Optional try: from rich.console import Console from rich.panel import Panel from rich.text import Text except ImportError: - raise ImportError("Rich is required for error handling. Install with: pip install rich") + raise ImportError( + "Rich is required for error handling. 
Install with: pip install rich" + ) class ErrorCategory(Enum): @@ -37,7 +39,7 @@ class ErrorCategory(Enum): @dataclass class ErrorContext: """Context information for errors.""" - + operation: str phase: Optional[str] = None component: Optional[str] = None @@ -49,7 +51,7 @@ class ErrorContext: class MADEngineError(Exception): """Base exception for all madengine errors.""" - + def __init__( self, message: str, @@ -57,7 +59,7 @@ def __init__( context: Optional[ErrorContext] = None, cause: Optional[Exception] = None, recoverable: bool = False, - suggestions: Optional[List[str]] = None + suggestions: Optional[List[str]] = None, ): super().__init__(message) self.message = message @@ -73,11 +75,7 @@ class ValidationError(MADEngineError): def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.VALIDATION, - context, - recoverable=True, - **kwargs + message, ErrorCategory.VALIDATION, context, recoverable=True, **kwargs ) @@ -86,11 +84,7 @@ class NetworkError(MADEngineError): def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.CONNECTION, - context, - recoverable=True, - **kwargs + message, ErrorCategory.CONNECTION, context, recoverable=True, **kwargs ) @@ -99,11 +93,7 @@ class AuthenticationError(MADEngineError): def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.AUTHENTICATION, - context, - recoverable=True, - **kwargs + message, ErrorCategory.AUTHENTICATION, context, recoverable=True, **kwargs ) @@ -112,37 +102,25 @@ class ExecutionError(MADEngineError): def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.RUNTIME, - context, - recoverable=False, - **kwargs + message, ErrorCategory.RUNTIME, context, recoverable=False, **kwargs ) class BuildError(MADEngineError): """Build 
and compilation errors.""" - + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.BUILD, - context, - recoverable=False, - **kwargs + message, ErrorCategory.BUILD, context, recoverable=False, **kwargs ) class DiscoveryError(MADEngineError): """Model discovery errors.""" - + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.DISCOVERY, - context, - recoverable=True, - **kwargs + message, ErrorCategory.DISCOVERY, context, recoverable=True, **kwargs ) @@ -151,11 +129,7 @@ class OrchestrationError(MADEngineError): def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.ORCHESTRATION, - context, - recoverable=False, - **kwargs + message, ErrorCategory.ORCHESTRATION, context, recoverable=False, **kwargs ) @@ -164,11 +138,7 @@ class RunnerError(MADEngineError): def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.RUNNER, - context, - recoverable=True, - **kwargs + message, ErrorCategory.RUNNER, context, recoverable=True, **kwargs ) @@ -177,11 +147,7 @@ class ConfigurationError(MADEngineError): def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.CONFIGURATION, - context, - recoverable=True, - **kwargs + message, ErrorCategory.CONFIGURATION, context, recoverable=True, **kwargs ) @@ -190,40 +156,38 @@ class DeploymentTimeoutError(MADEngineError): def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): super().__init__( - message, - ErrorCategory.TIMEOUT, - context, - recoverable=True, - **kwargs + message, ErrorCategory.TIMEOUT, context, recoverable=True, **kwargs ) class ErrorHandler: """Unified error handler with Rich console integration.""" - + def __init__(self, 
console: Optional[Console] = None, verbose: bool = False): self.console = console or Console() self.verbose = verbose self.logger = logging.getLogger(__name__) - + def handle_error( - self, - error: Exception, + self, + error: Exception, context: Optional[ErrorContext] = None, - show_traceback: Optional[bool] = None + show_traceback: Optional[bool] = None, ) -> None: """Handle and display errors with rich formatting.""" - + show_tb = show_traceback if show_traceback is not None else self.verbose - + if isinstance(error, MADEngineError): self._handle_madengine_error(error, show_tb) else: self._handle_generic_error(error, context, show_tb) - - def _handle_madengine_error(self, error: MADEngineError, show_traceback: bool) -> None: + + def _handle_madengine_error( + self, error: MADEngineError, show_traceback: bool + ) -> None: """Handle madengine structured errors.""" - + # Determine error emoji and color category_info = { ErrorCategory.VALIDATION: ("⚠️", "yellow"), @@ -237,16 +201,16 @@ def _handle_madengine_error(self, error: MADEngineError, show_traceback: bool) - ErrorCategory.CONFIGURATION: ("⚙️", "yellow"), ErrorCategory.TIMEOUT: ("⏱️", "yellow"), } - + emoji, color = category_info.get(error.category, ("❌", "red")) - + # Create error panel title = f"{emoji} {error.category.value.title()} Error" - + # Build error content content = Text() content.append(f"{error.message}\n", style=f"bold {color}") - + # Add context information if error.context: content.append("\n📋 Context:\n", style="bold cyan") @@ -262,58 +226,50 @@ def _handle_madengine_error(self, error: MADEngineError, show_traceback: bool) - content.append(f" Node: {error.context.node_id}\n") if error.context.file_path: content.append(f" File: {error.context.file_path}\n") - + # Add cause information if error.cause: content.append(f"\n🔗 Caused by: {str(error.cause)}\n", style="dim") - + # Add suggestions if error.suggestions: content.append("\n💡 Suggestions:\n", style="bold green") for suggestion in 
error.suggestions: content.append(f" • {suggestion}\n", style="green") - + # Add recovery information if error.recoverable: content.append("\n♻️ This error may be recoverable", style="bold blue") - - panel = Panel( - content, - title=title, - border_style=color, - expand=False - ) - + + panel = Panel(content, title=title, border_style=color, expand=False) + self.console.print(panel) - + # Show traceback if requested if show_traceback and error.cause: self.console.print("\n📚 [bold]Full Traceback:[/bold]") self.console.print_exception() - + # Log to file self.logger.error( f"{error.category.value}: {error.message}", extra={ "context": error.context.__dict__ if error.context else {}, "recoverable": error.recoverable, - "suggestions": error.suggestions - } + "suggestions": error.suggestions, + }, ) - + def _handle_generic_error( - self, - error: Exception, - context: Optional[ErrorContext], - show_traceback: bool + self, error: Exception, context: Optional[ErrorContext], show_traceback: bool ) -> None: """Handle generic Python exceptions.""" - + title = f"❌ {type(error).__name__}" - + content = Text() content.append(f"{str(error)}\n", style="bold red") - + if context: content.append("\n📋 Context:\n", style="bold cyan") content.append(f" Operation: {context.operation}\n") @@ -321,20 +277,15 @@ def _handle_generic_error( content.append(f" Phase: {context.phase}\n") if context.component: content.append(f" Component: {context.component}\n") - - panel = Panel( - content, - title=title, - border_style="red", - expand=False - ) - + + panel = Panel(content, title=title, border_style="red", expand=False) + self.console.print(panel) - + if show_traceback: self.console.print("\n📚 [bold]Full Traceback:[/bold]") self.console.print_exception() - + # Log to file self.logger.error(f"{type(error).__name__}: {str(error)}") @@ -355,9 +306,9 @@ def get_error_handler() -> Optional[ErrorHandler]: def handle_error( - error: Exception, + error: Exception, context: Optional[ErrorContext] = 
None, - show_traceback: Optional[bool] = None + show_traceback: Optional[bool] = None, ) -> None: """Handle error using the global error handler.""" if _global_error_handler: @@ -373,12 +324,7 @@ def create_error_context( operation: str, phase: Optional[str] = None, component: Optional[str] = None, - **kwargs + **kwargs, ) -> ErrorContext: """Convenience function to create error context.""" - return ErrorContext( - operation=operation, - phase=phase, - component=component, - **kwargs - ) + return ErrorContext(operation=operation, phase=phase, component=component, **kwargs) diff --git a/src/madengine/database/__init__.py b/src/madengine/database/__init__.py index 89c630c0..141bb03b 100644 --- a/src/madengine/database/__init__.py +++ b/src/madengine/database/__init__.py @@ -6,12 +6,12 @@ """ from .mongodb import ( - MongoDBHandler, - upload_csv_to_mongodb, - upload_file_to_mongodb, MongoDBConfig, + MongoDBHandler, UploadOptions, UploadResult, + upload_csv_to_mongodb, + upload_file_to_mongodb, ) __all__ = [ @@ -22,4 +22,3 @@ "UploadOptions", "UploadResult", ] - diff --git a/src/madengine/database/mongodb.py b/src/madengine/database/mongodb.py index 7713e991..e0d4bb48 100644 --- a/src/madengine/database/mongodb.py +++ b/src/madengine/database/mongodb.py @@ -21,7 +21,13 @@ from pymongo import UpdateOne from pymongo.errors import BulkWriteError, ConnectionFailure, PyMongoError from rich.console import Console -from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn +from rich.progress import ( + Progress, + SpinnerColumn, + TextColumn, + BarColumn, + TaskProgressColumn, +) logger = logging.getLogger(__name__) console = Console() @@ -31,62 +37,66 @@ # Configuration # ============================================================================ + @dataclass class MongoDBConfig: """MongoDB connection configuration.""" - + host: str = "localhost" port: int = 27017 username: str = "" password: str = "" auth_source: str = "admin" timeout_ms: 
int = 5000 - + @classmethod - def from_env(cls) -> 'MongoDBConfig': + def from_env(cls) -> "MongoDBConfig": """Load configuration from environment variables.""" import os + return cls( host=os.getenv("MONGO_HOST", "localhost"), port=int(os.getenv("MONGO_PORT", "27017")), username=os.getenv("MONGO_USER", ""), password=os.getenv("MONGO_PASSWORD", ""), auth_source=os.getenv("MONGO_AUTH_SOURCE", "admin"), - timeout_ms=int(os.getenv("MONGO_TIMEOUT_MS", "5000")) + timeout_ms=int(os.getenv("MONGO_TIMEOUT_MS", "5000")), ) - + @property def uri(self) -> str: """Build MongoDB connection URI.""" if self.username and self.password: - return (f"mongodb://{self.username}:{self.password}@" - f"{self.host}:{self.port}/{self.auth_source}") + return ( + f"mongodb://{self.username}:{self.password}@" + f"{self.host}:{self.port}/{self.auth_source}" + ) return f"mongodb://{self.host}:{self.port}" @dataclass class UploadOptions: """Options for document upload.""" - + # Deduplication strategy unique_fields: Optional[List[str]] = None # Fields to use for uniqueness upsert: bool = True # Update existing or insert only - + # Performance options batch_size: int = 1000 # Documents per batch ordered: bool = False # Continue on error - + # Index creation create_indexes: bool = True index_fields: Optional[List[str]] = None # Auto-detect if None - + # Metadata add_metadata: bool = True metadata_prefix: str = "_meta" - + # Validation validate_schema: bool = True - + # Dry run dry_run: bool = False @@ -94,7 +104,7 @@ class UploadOptions: @dataclass class UploadResult: """Result of upload operation.""" - + status: str # success, partial, failed documents_read: int documents_processed: int @@ -103,7 +113,7 @@ class UploadResult: documents_failed: int errors: List[str] = field(default_factory=list) duration_seconds: float = 0.0 - + def print_summary(self): """Print formatted summary.""" if self.status == "success": @@ -112,7 +122,7 @@ def print_summary(self): console.print(f"⚠️ [bold yellow]Partial 
success[/bold yellow]") else: console.print(f"❌ [bold red]Upload failed[/bold red]") - + console.print(f" 📊 Documents read: {self.documents_read}") console.print(f" ✨ Documents processed: {self.documents_processed}") console.print(f" ➕ Inserted: {self.documents_inserted}") @@ -126,20 +136,22 @@ def print_summary(self): # File Loaders (Strategy Pattern) # ============================================================================ + class FileFormat(Enum): """Supported file formats.""" + CSV = "csv" JSON = "json" class DocumentLoader(ABC): """Abstract base class for document loaders.""" - + @abstractmethod def load(self, file_path: Path) -> List[Dict[str, Any]]: """Load documents from file.""" pass - + @abstractmethod def infer_schema(self, documents: List[Dict[str, Any]]) -> Dict[str, type]: """Infer schema from documents.""" @@ -148,14 +160,14 @@ def infer_schema(self, documents: List[Dict[str, Any]]) -> Dict[str, type]: class JSONLoader(DocumentLoader): """Loader for JSON files with native type preservation.""" - + def load(self, file_path: Path) -> List[Dict[str, Any]]: """Load JSON file preserving native types.""" logger.info(f"Loading JSON file: {file_path}") - - with open(file_path, 'r') as f: + + with open(file_path, "r") as f: data = json.load(f) - + # Normalize to list if isinstance(data, dict): documents = [data] @@ -163,42 +175,42 @@ def load(self, file_path: Path) -> List[Dict[str, Any]]: documents = data else: raise ValueError(f"Expected JSON object or array, got {type(data)}") - + # Validate structure for i, doc in enumerate(documents): if not isinstance(doc, dict): raise ValueError(f"Document {i} is not a JSON object: {type(doc)}") - + logger.info(f"Loaded {len(documents)} documents from JSON") return documents - + def infer_schema(self, documents: List[Dict[str, Any]]) -> Dict[str, type]: """Infer schema from JSON documents.""" if not documents: return {} - + schema = {} sample_doc = documents[0] - + for key, value in sample_doc.items(): schema[key] 
= type(value) - + return schema class CSVLoader(DocumentLoader): """Loader for CSV files with intelligent type inference.""" - + def load(self, file_path: Path) -> List[Dict[str, Any]]: """Load CSV file with type inference.""" logger.info(f"Loading CSV file: {file_path}") - + # Read CSV with pandas (intelligent type inference) df = pd.read_csv(file_path) - + # Clean column names df.columns = df.columns.str.strip() - + # Convert to documents with native types preserved documents = [] for _, row in df.iterrows(): @@ -209,7 +221,7 @@ def load(self, file_path: Path) -> List[Dict[str, Any]]: if pd.isna(value): doc[col] = None # Try to parse JSON strings (for configs, multi_results) - elif isinstance(value, str) and value.strip().startswith(('{', '[')): + elif isinstance(value, str) and value.strip().startswith(("{", "[")): try: doc[col] = json.loads(value) except json.JSONDecodeError: @@ -217,44 +229,44 @@ def load(self, file_path: Path) -> List[Dict[str, Any]]: else: # Keep native type (int, float, bool, str) doc[col] = value if not pd.isna(value) else None - + documents.append(doc) - + logger.info(f"Loaded {len(documents)} documents from CSV") return documents - + def infer_schema(self, documents: List[Dict[str, Any]]) -> Dict[str, type]: """Infer schema from CSV documents.""" if not documents: return {} - + schema = {} sample_doc = documents[0] - + for key, value in sample_doc.items(): if value is None: schema[key] = type(None) else: schema[key] = type(value) - + return schema def detect_file_format(file_path: Path) -> FileFormat: """Detect file format from extension and content.""" - + extension = file_path.suffix.lower() - - if extension == '.json': + + if extension == ".json": return FileFormat.JSON - elif extension == '.csv': + elif extension == ".csv": return FileFormat.CSV - + # Content-based detection try: - with open(file_path, 'r') as f: + with open(file_path, "r") as f: first_char = f.read(1).strip() - if first_char in ['{', '[']: + if first_char in ["{", 
"["]: return FileFormat.JSON else: return FileFormat.CSV @@ -275,88 +287,93 @@ def get_loader(file_format: FileFormat) -> DocumentLoader: # Document Transformer # ============================================================================ + class DocumentTransformer: """Transform and enrich documents before upload.""" - + def __init__(self, options: UploadOptions): self.options = options - + def transform(self, documents: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Transform documents with metadata and normalization.""" transformed = [] - + for doc in documents: # Add metadata if self.options.add_metadata: doc = self._add_metadata(doc) - + # Normalize types doc = self._normalize_types(doc) - + transformed.append(doc) - + return transformed - + def _add_metadata(self, doc: Dict[str, Any]) -> Dict[str, Any]: """Add metadata fields.""" prefix = self.options.metadata_prefix - + # Add upload timestamp if not present if f"{prefix}_uploaded_at" not in doc: doc[f"{prefix}_uploaded_at"] = datetime.now(timezone.utc) - + # Preserve original created_date if present if "created_date" not in doc: - doc["created_date"] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") - + doc["created_date"] = datetime.now(timezone.utc).strftime( + "%Y-%m-%d %H:%M:%S" + ) + return doc - + def _normalize_types(self, doc: Dict[str, Any]) -> Dict[str, Any]: """Normalize types for MongoDB compatibility.""" normalized = {} - + for key, value in doc.items(): # Handle numpy types (from pandas) - if hasattr(value, 'item'): # numpy scalar + if hasattr(value, "item"): # numpy scalar value = value.item() - + # Convert pandas Timestamp to datetime - if hasattr(value, 'to_pydatetime'): + if hasattr(value, "to_pydatetime"): value = value.to_pydatetime() - + # Keep None as None (not empty string) if pd.isna(value): value = None - + normalized[key] = value - + return normalized - + def infer_unique_fields(self, documents: List[Dict[str, Any]]) -> List[str]: """Intelligently infer unique 
identifier fields.""" if not documents: return [] - + # Common unique field patterns - candidate_fields = ['model', 'name', 'id', 'timestamp', 'date', 'pipeline'] - + candidate_fields = ["model", "name", "id", "timestamp", "date", "pipeline"] + available_fields = set(documents[0].keys()) unique_fields = [] - + for field in candidate_fields: if field in available_fields: # Check if field has unique values values = [doc.get(field) for doc in documents[:100]] # Sample - if len(set(str(v) for v in values if v is not None)) == len([v for v in values if v is not None]): + if len(set(str(v) for v in values if v is not None)) == len( + [v for v in values if v is not None] + ): unique_fields.append(field) break # Found a unique field - + # If no single unique field, try combinations - if not unique_fields and 'model' in available_fields: - unique_fields = ['model'] - if 'timestamp' in available_fields: - unique_fields.append('timestamp') - + if not unique_fields and "model" in available_fields: + unique_fields = ["model"] + if "timestamp" in available_fields: + unique_fields.append("timestamp") + return unique_fields @@ -364,90 +381,87 @@ def infer_unique_fields(self, documents: List[Dict[str, Any]]) -> List[str]: # MongoDB Uploader # ============================================================================ + class MongoDBUploader: """Handles MongoDB connection and bulk upload operations.""" - + def __init__(self, config: MongoDBConfig): self.config = config self.client: Optional[pymongo.MongoClient] = None - + def __enter__(self): """Context manager entry.""" self.connect() return self - + def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit.""" self.disconnect() - + def connect(self): """Establish MongoDB connection.""" logger.info(f"Connecting to MongoDB at {self.config.host}:{self.config.port}") - + self.client = pymongo.MongoClient( - self.config.uri, - serverSelectionTimeoutMS=self.config.timeout_ms + self.config.uri, 
serverSelectionTimeoutMS=self.config.timeout_ms ) - + # Test connection self.client.server_info() logger.info("✅ Connected to MongoDB") - + def disconnect(self): """Close MongoDB connection.""" if self.client: self.client.close() logger.info("Disconnected from MongoDB") - + def upload( self, documents: List[Dict[str, Any]], database_name: str, collection_name: str, - options: UploadOptions + options: UploadOptions, ) -> UploadResult: """Upload documents to MongoDB with bulk operations.""" - + start_time = datetime.now() - + # Get collection db = self.client[database_name] collection = db[collection_name] - + # Create indexes if requested if options.create_indexes: self._create_indexes(collection, documents, options) - + # Perform bulk upload result = self._bulk_upload(collection, documents, options) - + # Calculate duration result.duration_seconds = (datetime.now() - start_time).total_seconds() - + return result - + def _create_indexes( - self, - collection, - documents: List[Dict[str, Any]], - options: UploadOptions + self, collection, documents: List[Dict[str, Any]], options: UploadOptions ): """Create indexes for efficient querying.""" if not documents: return - + # Determine fields to index index_fields = options.index_fields or [] - + if not index_fields and options.unique_fields: index_fields = options.unique_fields - + # Auto-detect common index candidates if not index_fields: - common_index_fields = ['model', 'timestamp', 'date', 'status', 'pipeline'] + common_index_fields = ["model", "timestamp", "date", "status", "pipeline"] available = set(documents[0].keys()) index_fields = [f for f in common_index_fields if f in available] - + # Create indexes for field in index_fields: try: @@ -455,7 +469,7 @@ def _create_indexes( logger.info(f"Created index on field: {field}") except PyMongoError as e: logger.warning(f"Could not create index on {field}: {e}") - + # Create compound index for unique fields if options.unique_fields and len(options.unique_fields) > 1: 
try: @@ -464,20 +478,17 @@ def _create_indexes( logger.info(f"Created compound index on: {options.unique_fields}") except PyMongoError as e: logger.warning(f"Could not create compound index: {e}") - + def _bulk_upload( - self, - collection, - documents: List[Dict[str, Any]], - options: UploadOptions + self, collection, documents: List[Dict[str, Any]], options: UploadOptions ) -> UploadResult: """Perform bulk upload with batching.""" - + total_inserted = 0 total_updated = 0 total_failed = 0 errors = [] - + # Prepare bulk operations if options.upsert and options.unique_fields: operations = self._build_upsert_operations(documents, options.unique_fields) @@ -487,10 +498,10 @@ def _bulk_upload( result = collection.insert_many(documents, ordered=options.ordered) total_inserted = len(result.inserted_ids) except BulkWriteError as e: - total_inserted = e.details.get('nInserted', 0) - total_failed = len(e.details.get('writeErrors', [])) - errors = [err['errmsg'] for err in e.details.get('writeErrors', [])] - + total_inserted = e.details.get("nInserted", 0) + total_failed = len(e.details.get("writeErrors", [])) + errors = [err["errmsg"] for err in e.details.get("writeErrors", [])] + return UploadResult( status="success" if total_failed == 0 else "partial", documents_read=len(documents), @@ -498,44 +509,49 @@ def _bulk_upload( documents_inserted=total_inserted, documents_updated=0, documents_failed=total_failed, - errors=errors + errors=errors, ) - + # Batched bulk write for upsert operations batch_size = options.batch_size - + with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), - console=console + console=console, ) as progress: - + task = progress.add_task( - f"Uploading to {collection.name}...", - total=len(operations) + f"Uploading to {collection.name}...", total=len(operations) ) - + for i in range(0, len(operations), batch_size): - batch = operations[i:i + batch_size] - + batch = operations[i : i + 
batch_size] + try: result = collection.bulk_write(batch, ordered=options.ordered) total_inserted += result.upserted_count total_updated += result.modified_count - + except BulkWriteError as e: - total_inserted += e.details.get('nUpserted', 0) - total_updated += e.details.get('nModified', 0) - write_errors = e.details.get('writeErrors', []) + total_inserted += e.details.get("nUpserted", 0) + total_updated += e.details.get("nModified", 0) + write_errors = e.details.get("writeErrors", []) total_failed += len(write_errors) - errors.extend([err['errmsg'] for err in write_errors[:5]]) # Limit error messages - + errors.extend( + [err["errmsg"] for err in write_errors[:5]] + ) # Limit error messages + progress.update(task, advance=len(batch)) - - status = "success" if total_failed == 0 else ("partial" if total_inserted + total_updated > 0 else "failed") - + + status = ( + "success" + if total_failed == 0 + else ("partial" if total_inserted + total_updated > 0 else "failed") + ) + return UploadResult( status=status, documents_read=len(documents), @@ -543,34 +559,26 @@ def _bulk_upload( documents_inserted=total_inserted, documents_updated=total_updated, documents_failed=total_failed, - errors=errors + errors=errors, ) - + def _build_upsert_operations( - self, - documents: List[Dict[str, Any]], - unique_fields: List[str] + self, documents: List[Dict[str, Any]], unique_fields: List[str] ) -> List[UpdateOne]: """Build bulk upsert operations.""" operations = [] - + for doc in documents: # Build filter from unique fields filter_doc = {field: doc[field] for field in unique_fields if field in doc} - + if not filter_doc: # No unique fields, skip or insert continue - + # Upsert operation - operations.append( - UpdateOne( - filter_doc, - {"$set": doc}, - upsert=True - ) - ) - + operations.append(UpdateOne(filter_doc, {"$set": doc}, upsert=True)) + return operations @@ -578,16 +586,17 @@ def _build_upsert_operations( # Main Upload Function # 
============================================================================ + def upload_file_to_mongodb( file_path: str, database_name: str, collection_name: str, config: Optional[MongoDBConfig] = None, - options: Optional[UploadOptions] = None + options: Optional[UploadOptions] = None, ) -> UploadResult: """ Upload CSV or JSON file to MongoDB with intelligent handling. - + This is the main entry point for file uploads. Args: @@ -599,7 +608,7 @@ def upload_file_to_mongodb( Returns: UploadResult with operation details - + Raises: FileNotFoundError: If file doesn't exist ValueError: If file format is invalid @@ -609,43 +618,49 @@ def upload_file_to_mongodb( file_path = Path(file_path) if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") - + config = config or MongoDBConfig.from_env() options = options or UploadOptions() - + # Detect format and load documents file_format = detect_file_format(file_path) loader = get_loader(file_format) - - console.print(f"📂 Loading {file_format.value.upper()} file: [cyan]{file_path.name}[/cyan]") + + console.print( + f"📂 Loading {file_format.value.upper()} file: [cyan]{file_path.name}[/cyan]" + ) documents = loader.load(file_path) - + if not documents: raise ValueError(f"No documents found in {file_path}") - + console.print(f"✅ Loaded {len(documents)} documents") - + # Transform documents transformer = DocumentTransformer(options) - + # Infer unique fields if not specified if options.unique_fields is None: options.unique_fields = transformer.infer_unique_fields(documents) if options.unique_fields: - console.print(f"🔑 Auto-detected unique fields: [yellow]{', '.join(options.unique_fields)}[/yellow]") - + console.print( + f"🔑 Auto-detected unique fields: [yellow]{', '.join(options.unique_fields)}[/yellow]" + ) + documents = transformer.transform(documents) - + # Handle dry-run before connecting to MongoDB if options.dry_run: - console.print(f"\n🔍 [yellow]DRY RUN: Would upload {len(documents)} 
documents[/yellow]") + console.print( + f"\n🔍 [yellow]DRY RUN: Would upload {len(documents)} documents[/yellow]" + ) console.print(f" Database: {database_name}") console.print(f" Collection: {collection_name}") if options.unique_fields: console.print(f" Unique fields: {', '.join(options.unique_fields)}") console.print(f" Upsert: {options.upsert}") console.print(f" Create indexes: {options.create_indexes}") - + return UploadResult( status="success", documents_read=len(documents), @@ -653,18 +668,18 @@ def upload_file_to_mongodb( documents_inserted=0, documents_updated=0, documents_failed=0, - duration_seconds=0.0 + duration_seconds=0.0, ) - + # Upload to MongoDB with MongoDBUploader(config) as uploader: result = uploader.upload( documents=documents, database_name=database_name, collection_name=collection_name, - options=options + options=options, ) - + return result @@ -672,42 +687,45 @@ def upload_file_to_mongodb( # Legacy Compatibility # ============================================================================ + def upload_csv_to_mongodb( csv_file_path: str, database_name: str, collection_name: str, - mongo_config: Optional[MongoDBConfig] = None + mongo_config: Optional[MongoDBConfig] = None, ) -> Dict[str, Any]: """ Upload CSV data to MongoDB collection. - + DEPRECATED: Use upload_file_to_mongodb() instead. This function is kept for backward compatibility. - + Args: csv_file_path: Path to CSV file database_name: Name of MongoDB database collection_name: Name of MongoDB collection mongo_config: MongoDB configuration (uses environment if None) - + Returns: Dictionary with operation results """ - logger.warning("upload_csv_to_mongodb is deprecated. Use upload_file_to_mongodb instead.") - + logger.warning( + "upload_csv_to_mongodb is deprecated. Use upload_file_to_mongodb instead." 
+ ) + result = upload_file_to_mongodb( file_path=csv_file_path, database_name=database_name, collection_name=collection_name, config=mongo_config, - options=UploadOptions() + options=UploadOptions(), ) - + # Convert UploadResult to legacy dict format return { "status": "success" if result.status == "success" else "partial", - "database": database_name, - "collection": collection_name, + "database": database_name, + "collection": collection_name, "records_processed": result.documents_processed, } @@ -715,7 +733,7 @@ def upload_csv_to_mongodb( class MongoDBHandler: """ Legacy handler class for MongoDB operations. - + DEPRECATED: This class is kept for backward compatibility. Use upload_file_to_mongodb() directly instead. """ @@ -728,15 +746,19 @@ def __init__(self, args): self.config = MongoDBConfig.from_env() self.database_name = args.database_name self.collection_name = args.collection_name - + # Support both old and new parameter names - self.file_path = getattr(args, 'file_path', None) or getattr(args, 'csv_file_path', None) - self.unique_key = getattr(args, 'unique_key', None) + self.file_path = getattr(args, "file_path", None) or getattr( + args, "csv_file_path", None + ) + self.unique_key = getattr(args, "unique_key", None) self.return_status = False def run(self) -> bool: """Execute the MongoDB upload operation.""" - logger.warning("MongoDBHandler is deprecated. Use upload_file_to_mongodb instead.") + logger.warning( + "MongoDBHandler is deprecated. Use upload_file_to_mongodb instead." 
+ ) console.print("\n" + "=" * 80) console.print("[bold blue]📤 UPLOADING TO MONGODB[/bold blue]") @@ -744,24 +766,26 @@ def run(self) -> bool: console.print(f"📂 File: [cyan]{self.file_path}[/cyan]") console.print(f"🗄️ Database: [cyan]{self.database_name}[/cyan]") console.print(f"📊 Collection: [cyan]{self.collection_name}[/cyan]") - + try: # Parse unique fields if provided unique_fields = None if self.unique_key: - unique_fields = [k.strip() for k in self.unique_key.split(',')] - + unique_fields = [k.strip() for k in self.unique_key.split(",")] + options = UploadOptions(unique_fields=unique_fields) - + result = upload_file_to_mongodb( file_path=self.file_path, database_name=self.database_name, collection_name=self.collection_name, config=self.config, - options=options + options=options, ) - console.print(f"✅ [bold green]Successfully processed {result.documents_processed} documents[/bold green]") + console.print( + f"✅ [bold green]Successfully processed {result.documents_processed} documents[/bold green]" + ) console.print(f" Inserted: {result.documents_inserted}") console.print(f" Updated: {result.documents_updated}") console.print("=" * 80 + "\n") @@ -773,7 +797,9 @@ def run(self) -> bool: self.return_status = False except ConnectionFailure as e: console.print(f"[bold red]❌ MongoDB connection failed:[/bold red] {e}") - console.print("[yellow]💡 Tip: Check MONGO_HOST, MONGO_PORT, MONGO_USER, MONGO_PASSWORD[/yellow]") + console.print( + "[yellow]💡 Tip: Check MONGO_HOST, MONGO_PORT, MONGO_USER, MONGO_PASSWORD[/yellow]" + ) self.return_status = False except ValueError as e: console.print(f"[bold red]❌ Invalid file:[/bold red] {e}") diff --git a/src/madengine/deployment/__init__.py b/src/madengine/deployment/__init__.py index c48e99b8..02618530 100644 --- a/src/madengine/deployment/__init__.py +++ b/src/madengine/deployment/__init__.py @@ -13,12 +13,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" -from .base import ( - BaseDeployment, - DeploymentConfig, - DeploymentResult, - DeploymentStatus, -) +from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus from .factory import DeploymentFactory __all__ = [ @@ -28,4 +23,3 @@ "DeploymentStatus", "DeploymentFactory", ] - diff --git a/src/madengine/deployment/base.py b/src/madengine/deployment/base.py index a032c037..a94cbd4e 100644 --- a/src/madengine/deployment/base.py +++ b/src/madengine/deployment/base.py @@ -19,7 +19,6 @@ from jinja2 import Environment, FileSystemLoader from rich.console import Console - # Regex for parsing "performance: " log lines. # Value: optional sign, integer/decimal, scientific notation (e or E). # Separator: optional unit suffix (/[a-zA-Z]+) and/or comma, in any order — @@ -205,7 +204,9 @@ def execute(self) -> DeploymentResult: metrics = self.collect_results(result.deployment_id) result.metrics = metrics except Exception as e: - self.console.print(f"[yellow]Warning: Could not collect results for {result.deployment_id}: {e}[/yellow]") + self.console.print( + f"[yellow]Warning: Could not collect results for {result.deployment_id}: {e}[/yellow]" + ) # Ensure empty metrics dict exists even if collection fails result.metrics = {"successful_runs": [], "failed_runs": []} @@ -214,7 +215,9 @@ def execute(self) -> DeploymentResult: except KeyboardInterrupt: if result is not None and getattr(result, "deployment_id", None): self.cleanup(result.deployment_id) - self.console.print("\n[yellow]Cancelled deployment and cleaned up resources.[/yellow]") + self.console.print( + "\n[yellow]Cancelled deployment and cleaned up resources.[/yellow]" + ) raise except Exception as e: self.console.print(f"[red]Deployment error: {e}[/red]") @@ -239,13 +242,15 @@ def _monitor_until_complete(self, deployment_id: str) -> DeploymentResult: while True: status = self.monitor(deployment_id) - if status.status in [DeploymentStatus.SUCCESS, DeploymentStatus.FAILED, 
DeploymentStatus.UNKNOWN]: + if status.status in [ + DeploymentStatus.SUCCESS, + DeploymentStatus.FAILED, + DeploymentStatus.UNKNOWN, + ]: return status # Still running, wait and check again - self.console.print( - f" Status: {status.status.value} - {status.message}" - ) + self.console.print(f" Status: {status.status.value} - {status.message}") time.sleep(30) # Check every 30 seconds # Abstract methods to be implemented by subclasses @@ -388,7 +393,7 @@ def _parse_performance_from_log( return None value = float(match.group(1)) - metric = match.group(2).rstrip(',') + metric = match.group(2).rstrip(",") node_id_pattern = r"node_id:\s*(\d+)" node_match = re.search(node_id_pattern, log_content) @@ -503,7 +508,9 @@ def _aggregate_node_metrics( aggregated_value = sum(m["performance"] for m in per_node_metrics) method_desc = "sum_across_nodes" elif aggregation_method == "average": - aggregated_value = statistics.mean(m["performance"] for m in per_node_metrics) + aggregated_value = statistics.mean( + m["performance"] for m in per_node_metrics + ) method_desc = "average_across_nodes" elif aggregation_method == "max": aggregated_value = max(m["performance"] for m in per_node_metrics) @@ -543,7 +550,8 @@ def _aggregate_node_metrics( durations = [ m.get("duration", m.get("test_duration", "N/A")) for m in per_node_metrics - if m.get("duration", "N/A") != "N/A" or m.get("test_duration", "N/A") != "N/A" + if m.get("duration", "N/A") != "N/A" + or m.get("test_duration", "N/A") != "N/A" ] if durations: duration_values = [] @@ -558,7 +566,9 @@ def _aggregate_node_metrics( duration = "N/A" total_gpus = sum(m.get("local_gpus", 1) for m in per_node_metrics) - gpus_per_node = per_node_metrics[0].get("local_gpus", 1) if per_node_metrics else 1 + gpus_per_node = ( + per_node_metrics[0].get("local_gpus", 1) if per_node_metrics else 1 + ) aggregated_record = { "model": first_metric["model"], @@ -660,4 +670,3 @@ def _write_to_perf_csv(self, perf_data: Dict[str, Any]) -> None: writer = 
csv.DictWriter(f, fieldnames=headers, extrasaction="ignore") writer.writeheader() writer.writerow(row_to_write) - diff --git a/src/madengine/deployment/common.py b/src/madengine/deployment/common.py index 5b898960..e7574e31 100644 --- a/src/madengine/deployment/common.py +++ b/src/madengine/deployment/common.py @@ -21,7 +21,7 @@ "primus", "vllm", "sglang", - "sglang-disagg" + "sglang-disagg", ] # Tool names that use rocprof / rocprofv3 wrapping and need MPI-aware rocprofv3 on multi-node. @@ -98,10 +98,7 @@ def is_rocprofv3_available() -> bool: """ try: result = subprocess.run( - ["rocprofv3", "--help"], - capture_output=True, - text=True, - timeout=5 + ["rocprofv3", "--help"], capture_output=True, text=True, timeout=5 ) return result.returncode == 0 except (FileNotFoundError, subprocess.TimeoutExpired, OSError): @@ -109,9 +106,7 @@ def is_rocprofv3_available() -> bool: def configure_multi_node_profiling( - nnodes: int, - tools_config: List[Dict], - logger: Any + nnodes: int, tools_config: List[Dict], logger: Any ) -> Dict[str, Any]: """ Configure profiling for multi-node runs with rocprofv3 support. 
@@ -148,7 +143,7 @@ def configure_multi_node_profiling( "enabled": True, "mode": "single_node", "tools": tools_config, - "per_node_collection": False + "per_node_collection": False, } if not is_rocprofv3_available(): @@ -156,7 +151,8 @@ def configure_multi_node_profiling( filtered_tools: List[Dict] = [ t for t in tools_config - if isinstance(t, dict) and t.get("name") not in _ROCPROF_FAMILY_TOOL_NAMES + if isinstance(t, dict) + and t.get("name") not in _ROCPROF_FAMILY_TOOL_NAMES ] if filtered_tools: logger.warning( @@ -197,7 +193,7 @@ def configure_multi_node_profiling( "enabled": False, "mode": "multi_node_unsupported", "tools": [], - "per_node_collection": False + "per_node_collection": False, } logger.info( "Multi-node: rocprofv3 not found on submission host; keeping non-rocprof tools " @@ -210,7 +206,9 @@ def configure_multi_node_profiling( "per_node_collection": True, } - logger.info(f"✓ Multi-node profiling enabled for {nnodes} nodes (rocprofv3 detected)") + logger.info( + f"✓ Multi-node profiling enabled for {nnodes} nodes (rocprofv3 detected)" + ) upgraded_tools: List[Dict] = [] for tool in tools_config: @@ -232,9 +230,13 @@ def configure_multi_node_profiling( tool_names = [ t.get("name") if isinstance(t, dict) else str(t) for t in upgraded_tools ] - logger.info(f" → Multi-node profiling tools: {', '.join(filter(None, tool_names))}") + logger.info( + f" → Multi-node profiling tools: {', '.join(filter(None, tool_names))}" + ) if "rccl_trace" in tool_names: - logger.info(" → ✓ rccl_trace enabled (critical for multi-node communication profiling)") + logger.info( + " → ✓ rccl_trace enabled (critical for multi-node communication profiling)" + ) return { "enabled": True, @@ -242,5 +244,5 @@ def configure_multi_node_profiling( "tools": upgraded_tools, "per_node_collection": True, "profiler": "rocprofv3", - "wrapper_mode": "launcher" + "wrapper_mode": "launcher", } diff --git a/src/madengine/deployment/config_loader.py b/src/madengine/deployment/config_loader.py 
index 06d8a1b1..9e7ccf97 100644 --- a/src/madengine/deployment/config_loader.py +++ b/src/madengine/deployment/config_loader.py @@ -11,12 +11,14 @@ """ import json +from copy import deepcopy from pathlib import Path from typing import Any, Callable, Dict, Optional -from copy import deepcopy -def apply_deployment_config(config: Any, load_fn: Callable[[Dict[str, Any]], Dict[str, Any]]) -> Dict[str, Any]: +def apply_deployment_config( + config: Any, load_fn: Callable[[Dict[str, Any]], Dict[str, Any]] +) -> Dict[str, Any]: """Apply deployment defaults via a loader and set config.additional_context. Used by SLURM and Kubernetes deployment classes before calling super().__init__(config). @@ -35,97 +37,103 @@ def apply_deployment_config(config: Any, load_fn: Callable[[Dict[str, Any]], Dic class ConfigLoader: """Smart configuration loader with preset support.""" - + PRESET_DIR = Path(__file__).parent / "presets" - + @classmethod def load_preset(cls, preset_path: str) -> Dict[str, Any]: """ Load a preset JSON file. - + Args: preset_path: Relative path to preset file from PRESET_DIR - + Returns: Dict containing preset configuration, or empty dict if not found """ full_path = cls.PRESET_DIR / preset_path if not full_path.exists(): return {} - + try: - with open(full_path, 'r') as f: + with open(full_path, "r") as f: return json.load(f) except (json.JSONDecodeError, IOError) as e: print(f"Warning: Could not load preset {preset_path}: {e}") return {} - + @classmethod def deep_merge(cls, base: Dict, override: Dict) -> Dict: """ Deep merge two dictionaries. Override wins conflicts. Nested dicts are merged, lists/primitives are replaced. Special handling: env_vars are merged (not replaced). 
- + Args: base: Base dictionary override: Override dictionary - + Returns: Merged dictionary """ result = deepcopy(base) - + for key, value in override.items(): # Skip documentation/comment fields from base if override has them - if key.startswith('_'): + if key.startswith("_"): result[key] = deepcopy(value) continue - - if key in result and isinstance(result[key], dict) and isinstance(value, dict): + + if ( + key in result + and isinstance(result[key], dict) + and isinstance(value, dict) + ): # Recursively merge nested dicts result[key] = cls.deep_merge(result[key], value) else: # Replace with override value result[key] = deepcopy(value) - + return result - + @classmethod def detect_profile_needs(cls, config: Dict) -> Dict[str, bool]: """ Detect what profiles/optimizations are needed. - + Args: config: Configuration dictionary - + Returns: Dict with flags: is_single_gpu, is_multi_gpu, is_multi_node, is_distributed """ distributed = config.get("distributed", {}) gpu_count = config.get("k8s", {}).get("gpu_count", 1) nnodes = distributed.get("nnodes", 1) - - is_distributed = distributed.get("enabled", False) or distributed.get("launcher") + + is_distributed = distributed.get("enabled", False) or distributed.get( + "launcher" + ) is_multi_gpu = gpu_count > 1 or is_distributed is_multi_node = nnodes > 1 - + return { "is_single_gpu": gpu_count == 1 and not is_distributed, "is_multi_gpu": is_multi_gpu and not is_multi_node, "is_multi_node": is_multi_node, - "is_distributed": is_distributed + "is_distributed": is_distributed, } - + @classmethod def select_profile(cls, config: Dict, needs: Dict[str, bool]) -> Optional[str]: """ Auto-select k8s profile based on configuration needs. 
- + Args: config: Configuration dictionary needs: Profile needs from detect_profile_needs() - + Returns: Profile filename or None """ @@ -135,82 +143,82 @@ def select_profile(cls, config: Dict, needs: Dict[str, bool]) -> Optional[str]: return "k8s/profiles/multi-gpu.json" elif needs["is_single_gpu"]: return "k8s/profiles/single-gpu.json" - + return None - + @classmethod def load_k8s_config(cls, user_config: Dict[str, Any]) -> Dict[str, Any]: """ Load complete k8s configuration with multi-layer merging. - + Layers: 1. Base k8s defaults 2. GPU vendor base preset 3. GPU vendor multi-GPU preset (if needed) 4. Profile preset (single-gpu/multi-gpu/multi-node) 5. User configuration (already merged from file + CLI) - + Args: user_config: User-provided configuration (merged from file + CLI) - + Returns: Complete configuration with all defaults applied """ # Layer 1: Base defaults config = cls.load_preset("k8s/defaults.json") - + # Merge user config temporarily to detect requirements temp_config = cls.deep_merge(config, user_config) needs = cls.detect_profile_needs(temp_config) - + # Layer 2: GPU vendor base preset gpu_vendor = temp_config.get("gpu_vendor", "AMD").upper() vendor_file = f"k8s/gpu-vendors/{gpu_vendor.lower()}.json" vendor_preset = cls.load_preset(vendor_file) config = cls.deep_merge(config, vendor_preset) - + # Layer 3: GPU vendor multi-GPU optimizations (AMD only, when needed) if gpu_vendor == "AMD" and (needs["is_multi_gpu"] or needs["is_multi_node"]): amd_multi_preset = cls.load_preset("k8s/gpu-vendors/amd-multi-gpu.json") config = cls.deep_merge(config, amd_multi_preset) - + # Layer 4: Profile preset based on detected needs profile_file = cls.select_profile(temp_config, needs) if profile_file: profile_preset = cls.load_preset(profile_file) config = cls.deep_merge(config, profile_preset) - + # Layer 5: User configuration (highest priority) config = cls.deep_merge(config, user_config) - + return config - + @classmethod def load_slurm_config(cls, user_config: 
Dict[str, Any]) -> Dict[str, Any]: """ Load complete SLURM configuration with multi-layer merging. - + Layers: 1. Base SLURM defaults 2. Profile preset (single-node/multi-node) 3. User configuration (already merged from file + CLI) - + Args: user_config: User-provided configuration - + Returns: Complete configuration with defaults applied """ # Layer 1: Base defaults config = cls.load_preset("slurm/defaults.json") - + # Merge user config temporarily to detect requirements temp_config = cls.deep_merge(config, user_config) - + # Layer 2: Profile preset based on detected configuration slurm_config = temp_config.get("slurm", {}) nodes = slurm_config.get("nodes", 1) - + # Select profile based on node count if nodes > 1: profile_preset = cls.load_preset("slurm/profiles/multi-node.json") @@ -218,39 +226,39 @@ def load_slurm_config(cls, user_config: Dict[str, Any]) -> Dict[str, Any]: else: profile_preset = cls.load_preset("slurm/profiles/single-node.json") config = cls.deep_merge(config, profile_preset) - + # Layer 3: User configuration (highest priority) config = cls.deep_merge(config, user_config) - + return config - + @classmethod def infer_and_validate_deploy_type(cls, user_config: Dict[str, Any]) -> str: """ Infer deployment type from config structure and validate for conflicts. - + Convention over Configuration: Presence of k8s/slurm field indicates deployment intent. - + Args: user_config: User configuration dictionary - + Returns: Deployment type: "k8s", "slurm", or "local" - + Raises: ValueError: If conflicting deployment configs present """ has_k8s = "k8s" in user_config or "kubernetes" in user_config has_slurm = "slurm" in user_config explicit_deploy = user_config.get("deploy", "").lower() - + # Validation Rule 1: Can't have both k8s and slurm configs if has_k8s and has_slurm: raise ValueError( "Conflicting deployment configuration: Both 'k8s' and 'slurm' fields present. " "Please specify only one deployment target." 
) - + # Validation Rule 2: If explicit deploy set, it must match config presence if explicit_deploy: if explicit_deploy in ["k8s", "kubernetes"] and not has_k8s: @@ -268,7 +276,7 @@ def infer_and_validate_deploy_type(cls, user_config: Dict[str, Any]) -> str: f"Conflicting deployment: 'deploy' field is 'local' but k8s/slurm config present. " "Remove k8s/slurm config for local execution." ) - + # Infer deployment type from config presence if has_k8s: return "k8s" @@ -276,34 +284,34 @@ def infer_and_validate_deploy_type(cls, user_config: Dict[str, Any]) -> str: return "slurm" else: return "local" - + @classmethod def load_config(cls, user_config: Dict[str, Any]) -> Dict[str, Any]: """ Load configuration with auto-inferred deploy type and validation. - + Infers deployment type from presence of k8s/slurm fields. Validates for conflicting configurations. Applies appropriate defaults based on deployment type. - + Convention over Configuration: - Presence of "k8s" field → Kubernetes deployment - Presence of "slurm" field → SLURM deployment - Neither present → Local execution - No explicit "deploy" field needed! 
- + Args: user_config: User configuration (from file + CLI merge) - + Returns: Complete configuration with defaults applied (no deploy field added) - + Raises: ValueError: If conflicting deployment configs present """ # Infer and validate deployment type deploy_type = cls.infer_and_validate_deploy_type(user_config) - + # Apply appropriate defaults based on deployment type # Note: We do NOT add a "deploy" field - type is inferred from structure if deploy_type == "k8s": @@ -313,4 +321,3 @@ def load_config(cls, user_config: Dict[str, Any]) -> Dict[str, Any]: else: # Local - return as-is (no deploy field needed) return user_config - diff --git a/src/madengine/deployment/factory.py b/src/madengine/deployment/factory.py index dea54557..833ae033 100644 --- a/src/madengine/deployment/factory.py +++ b/src/madengine/deployment/factory.py @@ -89,6 +89,7 @@ def register_default_deployments(): DeploymentFactory.register("kubernetes", KubernetesDeployment) except ImportError: import warnings + warnings.warn( "Kubernetes deployment target is unavailable: the 'kubernetes' library is not " "installed. Install it with: pip install madengine[kubernetes] " @@ -100,4 +101,3 @@ def register_default_deployments(): # Auto-register on module import register_default_deployments() - diff --git a/src/madengine/deployment/k8s_names.py b/src/madengine/deployment/k8s_names.py index bfa90569..cbbd9f41 100644 --- a/src/madengine/deployment/k8s_names.py +++ b/src/madengine/deployment/k8s_names.py @@ -36,7 +36,9 @@ def _trim_edges_alnum(s: str) -> str: return s or "x" -def sanitize_k8s_object_name(prefix: str, raw_model_name: str, max_total_len: int = _MAX_OBJECT_NAME_LEN) -> str: +def sanitize_k8s_object_name( + prefix: str, raw_model_name: str, max_total_len: int = _MAX_OBJECT_NAME_LEN +) -> str: """ Build a valid ``metadata.name`` substring from a model name. 
@@ -74,7 +76,9 @@ def sanitize_k8s_object_name(prefix: str, raw_model_name: str, max_total_len: in room = max_total_len - len(anchor) - 1 if room < 8: # Extreme: prefix alone too long — fall back to hash-only tail - return _trim_edges_alnum(f"{digest}-{hashlib.sha256(raw.encode()).hexdigest()[:20]}")[:max_total_len] + return _trim_edges_alnum( + f"{digest}-{hashlib.sha256(raw.encode()).hexdigest()[:20]}" + )[:max_total_len] tail = body[:room] if room > 0 else "" tail = _trim_edges_alnum(tail) if tail else "m" @@ -84,7 +88,9 @@ def sanitize_k8s_object_name(prefix: str, raw_model_name: str, max_total_len: in return _trim_edges_alnum(out) -def sanitize_k8s_container_name(name_hint: str, max_len: int = _MAX_DNS_LABEL_LEN) -> str: +def sanitize_k8s_container_name( + name_hint: str, max_len: int = _MAX_DNS_LABEL_LEN +) -> str: """ Sanitize for ``spec.containers[].name`` / initContainer names. diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py index 927ec878..437a8ee1 100644 --- a/src/madengine/deployment/kubernetes.py +++ b/src/madengine/deployment/kubernetes.py @@ -35,7 +35,13 @@ from jinja2 import Template -from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus, create_jinja_env +from .base import ( + BaseDeployment, + DeploymentConfig, + DeploymentResult, + DeploymentStatus, + create_jinja_env, +) from .common import ( configure_multi_node_profiling, normalize_launcher, @@ -57,11 +63,19 @@ from madengine.core.errors import ConfigurationError from madengine.utils.gpu_config import resolve_runtime_gpus from madengine.utils.path_utils import get_madengine_root, scripts_base_dir_from -from madengine.utils.run_details import flatten_tags_in_place, get_build_number, get_pipeline +from madengine.utils.run_details import ( + flatten_tags_in_place, + get_build_number, + get_pipeline, +) try: from madengine.reporting.update_perf_csv import update_perf_csv - from madengine.reporting.update_perf_super 
import update_perf_super_json, update_perf_super_csv + from madengine.reporting.update_perf_super import ( + update_perf_super_json, + update_perf_super_csv, + ) + REPORTING_AVAILABLE = True except ImportError: REPORTING_AVAILABLE = False @@ -78,6 +92,8 @@ def _pod_job_name_label_selector(deployment_id: str) -> str: """Selector for the ``job-name`` pod label; value must be a valid ≤63-char label value.""" return f"job-name={sanitize_k8s_label_value(deployment_id)}" + + from .primus_backend import ( infer_primus_backend_from_model_name, infer_primus_examples_overlay_subdirs, @@ -107,7 +123,9 @@ def match_pvc_subdir_to_k8s_pod( return sorted(prefixed)[0] -def assign_pvc_subdirs_to_pods(pod_dirs: List[str], pod_names: List[str]) -> Dict[str, str]: +def assign_pvc_subdirs_to_pods( + pod_dirs: List[str], pod_names: List[str] +) -> Dict[str, str]: """ Assign each PVC subdir to at most one pod. Process longest names first so short prefixes do not steal pods (e.g. ``foo-0`` before ``foo``). @@ -162,8 +180,7 @@ def __init__(self, config: DeploymentConfig): if not YAML_AVAILABLE: raise ImportError( - "PyYAML library not installed.\n" - "Install with: pip install pyyaml" + "PyYAML library not installed.\n" "Install with: pip install pyyaml" ) apply_deployment_config(config, ConfigLoader.load_k8s_config) @@ -208,7 +225,9 @@ def __init__(self, config: DeploymentConfig): self.job_name = None self.job_label = None # pod label job-name + label selectors; ≤63 chars (sanitize_k8s_label_value) self.service_name = None # headless Service metadata.name + Pod subdomain; DNS label ≤63 (no dots) - self.main_container_name = None # same string as service_name (container names are DNS labels) + self.main_container_name = ( + None # same string as service_name (container names are DNS labels) + ) self.configmap_name = None self.configmap_yaml = None self.job_yaml = None @@ -347,9 +366,9 @@ def gather_system_env_details( ) -> None: """ Gather system environment details by adding rocEnvTool to 
pre-scripts. - + This ensures K8s deployment collects the same system info as local execution. - + Args: pre_scripts: List of pre-script configurations model_name: The model name (used for output file naming) @@ -357,18 +376,22 @@ def gather_system_env_details( # Add rocEnvTool pre-script with model-specific output name pre_env_details = { "path": "scripts/common/pre_scripts/run_rocenv_tool.sh", - "args": model_name.replace("/", "_") + "_env" + "args": model_name.replace("/", "_") + "_env", } pre_scripts.append(pre_env_details) - self.console.print(f"[dim]Added rocEnvTool to pre-scripts with args: {pre_env_details['args']}[/dim]") - - def _add_tool_scripts(self, pre_scripts: List[Dict], post_scripts: List[Dict]) -> None: + self.console.print( + f"[dim]Added rocEnvTool to pre-scripts with args: {pre_env_details['args']}[/dim]" + ) + + def _add_tool_scripts( + self, pre_scripts: List[Dict], post_scripts: List[Dict] + ) -> None: """ Add tool pre/post scripts to execution lists (similar to local execution). - + Extracts pre_scripts and post_scripts from tools.json definitions and adds them to the pre_scripts and post_scripts lists for execution in K8s pods. 
- + Args: pre_scripts: List to append tool pre-scripts to post_scripts: List to append tool post-scripts to @@ -376,95 +399,110 @@ def _add_tool_scripts(self, pre_scripts: List[Dict], post_scripts: List[Dict]) - tools_config = self._get_tools_config() if not tools_config: return - + # Load tools.json to get pre/post script definitions tools_json_path = get_madengine_root() / "scripts" / "common" / "tools.json" if not tools_json_path.exists(): return - + with open(tools_json_path, "r") as f: tools_definitions = json.load(f) - + # Add pre/post scripts from each configured tool for tool in tools_config: tool_name = tool.get("name") if not tool_name or tool_name not in tools_definitions.get("tools", {}): continue - + tool_def = tools_definitions["tools"][tool_name] - + # Add pre-scripts (at beginning, like local execution) if "pre_scripts" in tool_def: pre_scripts[:0] = tool_def["pre_scripts"] - + # Add post-scripts (at end, like local execution) if "post_scripts" in tool_def: post_scripts.extend(tool_def["post_scripts"]) - + def _load_common_scripts(self, script_list: List[Dict]) -> Dict[str, str]: """ Load common script contents from madengine package for embedding in ConfigMap. - + Since madengine is not installed in model Docker images, we need to embed the common scripts (pre_scripts, post_scripts, and tool wrapper scripts) in the ConfigMap. 
- + Args: script_list: List of script configurations with 'path' field - + Returns: Dict mapping relative script paths to their contents """ import os + script_contents = {} madengine_root = get_madengine_root() - + for script_config in script_list: script_path = script_config.get("path", "") if not script_path: continue - + # Convert to absolute path from madengine root abs_script_path = madengine_root / script_path - + if abs_script_path.exists() and abs_script_path.is_file(): with open(abs_script_path, "r") as f: script_contents[script_path] = f.read() self.console.print(f"[dim]Loaded common script: {script_path}[/dim]") - + # If it's run_rocenv_tool.sh, also load the entire rocEnvTool directory if "run_rocenv_tool.sh" in script_path: rocenv_dir = abs_script_path.parent / "rocEnvTool" if rocenv_dir.exists() and rocenv_dir.is_dir(): # Load all Python files for py_file in rocenv_dir.glob("*.py"): - rel_path = f"scripts/common/pre_scripts/rocEnvTool/{py_file.name}" + rel_path = ( + f"scripts/common/pre_scripts/rocEnvTool/{py_file.name}" + ) with open(py_file, "r") as f: script_contents[rel_path] = f.read() - self.console.print(f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]") - + self.console.print( + f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]" + ) + # Load all JSON files (e.g., env_tags.json) for json_file in rocenv_dir.glob("*.json"): rel_path = f"scripts/common/pre_scripts/rocEnvTool/{json_file.name}" with open(json_file, "r") as f: script_contents[rel_path] = f.read() - self.console.print(f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]") + self.console.print( + f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]" + ) else: - self.console.print(f"[yellow]Warning: Script not found: {script_path} (at {abs_script_path})[/yellow]") - + self.console.print( + f"[yellow]Warning: Script not found: {script_path} (at {abs_script_path})[/yellow]" + ) + # Load tool wrapper scripts if tools are configured tools_config = self._get_tools_config() if tools_config: - 
self._load_tool_wrapper_scripts(script_contents, tools_config, madengine_root) - + self._load_tool_wrapper_scripts( + script_contents, tools_config, madengine_root + ) + return script_contents - - def _load_tool_wrapper_scripts(self, script_contents: Dict[str, str], - tools_config: List[Dict], madengine_root: Path) -> None: + + def _load_tool_wrapper_scripts( + self, + script_contents: Dict[str, str], + tools_config: List[Dict], + madengine_root: Path, + ) -> None: """ Load tool wrapper scripts and tools.json for K8s ConfigMap. - + This enables profiling tools like rocprof to work in K8s deployments. - + Args: script_contents: Dict to populate with script contents tools_config: List of tool configurations from manifest @@ -475,28 +513,34 @@ def _load_tool_wrapper_scripts(self, script_contents: Dict[str, str], if tools_json_path.exists(): with open(tools_json_path, "r") as f: tools_definitions = json.load(f) - script_contents["scripts/common/tools.json"] = json.dumps(tools_definitions, indent=2) + script_contents["scripts/common/tools.json"] = json.dumps( + tools_definitions, indent=2 + ) self.console.print(f"[dim]Loaded tools.json[/dim]") else: - self.console.print(f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]") + self.console.print( + f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]" + ) return - + # Extract and load wrapper scripts referenced in tool commands for tool in tools_config: tool_name = tool.get("name") if not tool_name: continue - + # Get tool definition from tools.json if tool_name not in tools_definitions.get("tools", {}): - self.console.print(f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]") + self.console.print( + f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]" + ) continue - + tool_def = tools_definitions["tools"][tool_name] - + # Extract cmd - could be from tool config override or tool definition cmd = tool.get("cmd", tool_def.get("cmd", "")) - + # Check 
if cmd references a script in scripts/common/tools/ if "scripts/common/tools/" in cmd: # Parse script path from command (e.g., "bash ../scripts/common/tools/rocprof_wrapper.sh --runtime-trace") @@ -508,29 +552,43 @@ def _load_tool_wrapper_scripts(self, script_contents: Dict[str, str], # Remove ../ prefix if present script_rel_path = part.replace("../", "") abs_script_path = madengine_root / script_rel_path - + if abs_script_path.exists() and abs_script_path.is_file(): with open(abs_script_path, "r") as f: script_contents[script_rel_path] = f.read() - self.console.print(f"[dim]Loaded tool script: {script_rel_path}[/dim]") - + self.console.print( + f"[dim]Loaded tool script: {script_rel_path}[/dim]" + ) + # If it's a Python script, also load utility modules it might depend on - if script_rel_path.endswith('.py'): + if script_rel_path.endswith(".py"): tools_dir = abs_script_path.parent # Load common utility modules that profiling tools depend on - utility_modules = ['amd_smi_utils.py', 'rocm_smi_utils.py', 'pynvml_utils.py'] + utility_modules = [ + "amd_smi_utils.py", + "rocm_smi_utils.py", + "pynvml_utils.py", + ] for util_file in utility_modules: util_path = tools_dir / util_file if util_path.exists(): - util_rel_path = f"scripts/common/tools/{util_file}" + util_rel_path = ( + f"scripts/common/tools/{util_file}" + ) if util_rel_path not in script_contents: with open(util_path, "r") as f: - script_contents[util_rel_path] = f.read() - self.console.print(f"[dim]Loaded tool utility module: {util_rel_path}[/dim]") + script_contents[util_rel_path] = ( + f.read() + ) + self.console.print( + f"[dim]Loaded tool utility module: {util_rel_path}[/dim]" + ) else: - self.console.print(f"[yellow]Warning: Tool script not found: {script_rel_path} (at {abs_script_path})[/yellow]") + self.console.print( + f"[yellow]Warning: Tool script not found: {script_rel_path} (at {abs_script_path})[/yellow]" + ) break - + # Also load any tool-specific pre_scripts and post_scripts for 
script_config in tool_def.get("pre_scripts", []): script_path = script_config.get("path", "") @@ -539,8 +597,10 @@ def _load_tool_wrapper_scripts(self, script_contents: Dict[str, str], if abs_script_path.exists(): with open(abs_script_path, "r") as f: script_contents[script_path] = f.read() - self.console.print(f"[dim]Loaded tool pre-script: {script_path}[/dim]") - + self.console.print( + f"[dim]Loaded tool pre-script: {script_path}[/dim]" + ) + for script_config in tool_def.get("post_scripts", []): script_path = script_config.get("path", "") if script_path and script_path not in script_contents: @@ -548,8 +608,10 @@ def _load_tool_wrapper_scripts(self, script_contents: Dict[str, str], if abs_script_path.exists(): with open(abs_script_path, "r") as f: script_contents[script_path] = f.read() - self.console.print(f"[dim]Loaded tool post-script: {script_path}[/dim]") - + self.console.print( + f"[dim]Loaded tool post-script: {script_path}[/dim]" + ) + # NEW: Scan pre-scripts for dependencies on scripts/common/tools/ files # This handles cases like gpu_info_vram_profiler where the pre-script # calls python3 scripts/common/tools/gpu_info_profiler.py but the tool @@ -564,30 +626,51 @@ def _load_tool_wrapper_scripts(self, script_contents: Dict[str, str], script_content = f.read() # Look for references to scripts/common/tools/ in the pre-script import re + # Use non-capturing group (?:...) 
to avoid capturing just the ../ part - tool_refs = re.findall(r'(?:\.\./)?scripts/common/tools/[\w_]+\.py', script_content) + tool_refs = re.findall( + r"(?:\.\./)?scripts/common/tools/[\w_]+\.py", + script_content, + ) for tool_ref in tool_refs: # Clean up the path - tool_script_path = tool_ref.strip('"\'').replace("../", "") + tool_script_path = tool_ref.strip("\"'").replace( + "../", "" + ) abs_tool_path = madengine_root / tool_script_path - - if abs_tool_path.exists() and tool_script_path not in script_contents: + + if ( + abs_tool_path.exists() + and tool_script_path not in script_contents + ): with open(abs_tool_path, "r") as tf: script_contents[tool_script_path] = tf.read() - self.console.print(f"[dim]Loaded tool dependency: {tool_script_path}[/dim]") - + self.console.print( + f"[dim]Loaded tool dependency: {tool_script_path}[/dim]" + ) + # Also load utility modules for this Python script - if tool_script_path.endswith('.py'): + if tool_script_path.endswith(".py"): tools_dir = abs_tool_path.parent - utility_modules = ['amd_smi_utils.py', 'rocm_smi_utils.py', 'pynvml_utils.py'] + utility_modules = [ + "amd_smi_utils.py", + "rocm_smi_utils.py", + "pynvml_utils.py", + ] for util_file in utility_modules: util_path = tools_dir / util_file if util_path.exists(): - util_rel_path = f"scripts/common/tools/{util_file}" + util_rel_path = ( + f"scripts/common/tools/{util_file}" + ) if util_rel_path not in script_contents: with open(util_path, "r") as uf: - script_contents[util_rel_path] = uf.read() - self.console.print(f"[dim]Loaded utility module (from dependency): {util_rel_path}[/dim]") + script_contents[ + util_rel_path + ] = uf.read() + self.console.print( + f"[dim]Loaded utility module (from dependency): {util_rel_path}[/dim]" + ) def _bundle_primus_k8s_examples_overlay( self, model_scripts_contents: Dict[str, str], model_name: str = "" @@ -644,7 +727,9 @@ def _add_primus_file(host_file: Path) -> bool: req = primus_repo / "requirements.txt" if req.is_file(): if 
_add_primus_file(req): - self.console.print("[dim]Primus K8s: bundled Primus/requirements.txt[/dim]") + self.console.print( + "[dim]Primus K8s: bundled Primus/requirements.txt[/dim]" + ) ex_scripts = primus_repo / "examples" / "scripts" if ex_scripts.is_dir(): @@ -661,7 +746,9 @@ def _add_primus_file(host_file: Path) -> bool: run_pre = primus_repo / "examples" / "run_pretrain.sh" if run_pre.is_file(): if _add_primus_file(run_pre): - self.console.print("[dim]Primus K8s: bundled Primus/examples/run_pretrain.sh[/dim]") + self.console.print( + "[dim]Primus K8s: bundled Primus/examples/run_pretrain.sh[/dim]" + ) for sub in subdirs: base = primus_repo / "examples" / sub @@ -709,7 +796,7 @@ def _prepare_template_context( if credential_path.exists(): with open(credential_path, "r") as f: credential_content = f.read() - + # Load data.json content if exists data_json_content = None data_path = Path("data.json") @@ -720,17 +807,19 @@ def _prepare_template_context( # Load model scripts directory content (entire folder, not just one file) # This matches local execution which mounts the entire MODEL_DIR/scripts folder - model_script_path = model_info.get("scripts") # e.g., "scripts/dummy/run_data_minio.sh" + model_script_path = model_info.get( + "scripts" + ) # e.g., "scripts/dummy/run_data_minio.sh" model_script_dir = None model_script_filename = None model_scripts_contents = {} # Store all scripts in the directory - + if model_script_path: script_file = Path(model_script_path) # Extract directory and filename model_script_dir = str(script_file.parent) # e.g., "scripts/dummy" - model_script_filename = script_file.name # e.g., "run_data_minio.sh" - + model_script_filename = script_file.name # e.g., "run_data_minio.sh" + # Bundle entire scripts/ directory recursively for reliability across # different model types (vllm, sglang, etc.) 
with varying file types and subdirs scripts_dir_path = Path(model_script_dir) @@ -758,19 +847,23 @@ def _prepare_template_context( # Fallback: load single file if directory doesn't exist with open(script_file, "r") as f: model_scripts_contents[model_script_path] = f.read() - self.console.print(f"[dim]Loaded single script: {model_script_path}[/dim]") + self.console.print( + f"[dim]Loaded single script: {model_script_path}[/dim]" + ) else: - self.console.print(f"[yellow]Warning: Script not found: {model_script_path}[/yellow]") - + self.console.print( + f"[yellow]Warning: Script not found: {model_script_path}[/yellow]" + ) + # Load K8s tools configuration k8s_tools_config = self._load_k8s_tools() - + # Prepare data configuration first data_config = self._prepare_data_config(model_info) - + # Store for use in deploy() method self._data_config = data_config - + # K8s best practice: Auto-create shared data PVC if needed # K8s philosophy: Separate compute (pods) from storage (PVC) if data_config and not self.k8s_config.get("data_pvc"): @@ -790,7 +883,7 @@ def _prepare_template_context( ) # Set PVC name now so templates are rendered with correct value self.k8s_config["data_pvc"] = "madengine-shared-data" - + # Determine data provider script if model needs data data_provider_script = None data_provider_script_content = None @@ -798,16 +891,20 @@ def _prepare_template_context( provider_type = data_config.get("provider_type", "local") if provider_type in k8s_tools_config.get("data_providers", {}): data_provider_script = k8s_tools_config["data_providers"][provider_type] - + # Load K8s data provider script content k8s_script_path = get_madengine_root() / data_provider_script["script"] if k8s_script_path.exists(): with open(k8s_script_path, "r") as f: data_provider_script_content = f.read() - self.console.print(f"[dim]Loaded K8s data provider: {data_provider_script['script']}[/dim]") + self.console.print( + f"[dim]Loaded K8s data provider: {data_provider_script['script']}[/dim]" 
+ ) else: - self.console.print(f"[yellow]Warning: K8s script not found: {k8s_script_path}[/yellow]") - + self.console.print( + f"[yellow]Warning: K8s script not found: {k8s_script_path}[/yellow]" + ) + # Get launcher configuration from manifest's deployment_config or additional_context deployment_config = self.manifest.get("deployment_config", {}) distributed_config = deployment_config.get("distributed", {}) @@ -816,86 +913,130 @@ def _prepare_template_context( # Merge manifest and runtime launcher config (runtime overrides) # Use explicit None checking to handle 0 values correctly launcher_type = ( - launcher_config.get("type") - if launcher_config.get("type") is not None + launcher_config.get("type") + if launcher_config.get("type") is not None else distributed_config.get("launcher") ) - + nnodes = ( launcher_config.get("nnodes") if launcher_config.get("nnodes") is not None else distributed_config.get("nnodes", 1) ) - + # Store for use in deploy() method self._nnodes = nnodes - + nproc_per_node = ( launcher_config.get("nproc_per_node") if launcher_config.get("nproc_per_node") is not None - else distributed_config.get("nproc_per_node") - if distributed_config.get("nproc_per_node") is not None - else int(model_info.get("n_gpus", 1)) + else ( + distributed_config.get("nproc_per_node") + if distributed_config.get("nproc_per_node") is not None + else int(model_info.get("n_gpus", 1)) + ) ) - + master_port = launcher_config.get("master_port", 29500) # Validate configuration if launcher_type == "torchrun": if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + raise ValueError( + f"Invalid nnodes: {nnodes}. Must be positive integer >= 1" + ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. 
Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring torchrun: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") - + raise ValueError( + f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1" + ) + + self.console.print( + f"[cyan]Configuring torchrun: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]" + ) + elif launcher_type == "deepspeed": if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + raise ValueError( + f"Invalid nnodes: {nnodes}. Must be positive integer >= 1" + ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring DeepSpeed: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + raise ValueError( + f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1" + ) + + self.console.print( + f"[cyan]Configuring DeepSpeed: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]" + ) elif launcher_type == "torchtitan": if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + raise ValueError( + f"Invalid nnodes: {nnodes}. Must be positive integer >= 1" + ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring TorchTitan: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + raise ValueError( + f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1" + ) + + self.console.print( + f"[cyan]Configuring TorchTitan: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]" + ) elif launcher_type == "vllm": if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. 
Must be positive integer >= 1") + raise ValueError( + f"Invalid nnodes: {nnodes}. Must be positive integer >= 1" + ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring vLLM: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + raise ValueError( + f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1" + ) + + self.console.print( + f"[cyan]Configuring vLLM: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]" + ) elif launcher_type == "sglang": if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + raise ValueError( + f"Invalid nnodes: {nnodes}. Must be positive integer >= 1" + ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring SGLang: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + raise ValueError( + f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1" + ) + + self.console.print( + f"[cyan]Configuring SGLang: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]" + ) elif launcher_type == "megatron": if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + raise ValueError( + f"Invalid nnodes: {nnodes}. Must be positive integer >= 1" + ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring Megatron-LM: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + raise ValueError( + f"Invalid nproc_per_node: {nproc_per_node}. 
Must be positive integer >= 1" + ) + + self.console.print( + f"[cyan]Configuring Megatron-LM: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]" + ) elif launcher_type == "primus": if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + raise ValueError( + f"Invalid nnodes: {nnodes}. Must be positive integer >= 1" + ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring Primus: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + raise ValueError( + f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1" + ) + + self.console.print( + f"[cyan]Configuring Primus: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]" + ) self._bundle_primus_k8s_examples_overlay(model_scripts_contents, model_name) # Determine if we need multi-node setup @@ -905,32 +1046,38 @@ def _prepare_template_context( if launcher_type == "torchrun": if nnodes > 1: create_headless_service = True - self.console.print(f"[dim]Multi-node detected: Creating headless service for pod discovery[/dim]") - + self.console.print( + f"[dim]Multi-node detected: Creating headless service for pod discovery[/dim]" + ) + # Generate torchrun launcher command launcher_command = self._generate_torchrun_command( nnodes=nnodes, nproc_per_node=nproc_per_node, master_port=master_port, - model_script=model_info.get("scripts", "run.sh") + model_script=model_info.get("scripts", "run.sh"), ) - + elif launcher_type == "deepspeed": if nnodes > 1: create_headless_service = True - self.console.print(f"[dim]Multi-node DeepSpeed: Creating headless service for pod discovery[/dim]") - + self.console.print( + f"[dim]Multi-node DeepSpeed: Creating headless service for pod discovery[/dim]" + ) + model_script = model_info.get("scripts", "run.sh") - + # Check if script is a bash script - if so, execute it 
directly # as it will handle the launcher internally - if model_script.endswith('.sh'): - self.console.print(f"[dim]Detected bash script ({model_script}), will execute directly[/dim]") + if model_script.endswith(".sh"): + self.console.print( + f"[dim]Detected bash script ({model_script}), will execute directly[/dim]" + ) launcher_command = self._generate_bash_script_command( nnodes=nnodes, nproc_per_node=nproc_per_node, master_port=master_port, - model_script=model_script + model_script=model_script, ) else: # Python script - use DeepSpeed launcher @@ -938,27 +1085,31 @@ def _prepare_template_context( nnodes=nnodes, nproc_per_node=nproc_per_node, master_port=master_port, - model_script=model_script + model_script=model_script, ) elif launcher_type == "torchtitan": if nnodes > 1: create_headless_service = True - self.console.print(f"[dim]Multi-node TorchTitan: Creating headless service for pod discovery[/dim]") - + self.console.print( + f"[dim]Multi-node TorchTitan: Creating headless service for pod discovery[/dim]" + ) + # Generate TorchTitan launcher command launcher_command = self._generate_torchtitan_command( nnodes=nnodes, nproc_per_node=nproc_per_node, master_port=master_port, - model_script=model_info.get("scripts", "run.sh") + model_script=model_info.get("scripts", "run.sh"), ) elif launcher_type == "vllm": if nnodes > 1: create_headless_service = True - self.console.print(f"[dim]Multi-node vLLM: Creating headless service for Ray cluster[/dim]") - + self.console.print( + f"[dim]Multi-node vLLM: Creating headless service for Ray cluster[/dim]" + ) + # Generate vLLM launcher command (pass model args so run.sh gets --model_repo etc.) 
launcher_command = self._generate_vllm_command( nnodes=nnodes, @@ -971,8 +1122,10 @@ def _prepare_template_context( elif launcher_type == "sglang": if nnodes > 1: create_headless_service = True - self.console.print(f"[dim]Multi-node SGLang: Creating headless service for Ray cluster[/dim]") - + self.console.print( + f"[dim]Multi-node SGLang: Creating headless service for Ray cluster[/dim]" + ) + # Generate SGLang launcher command (pass model args so run.sh gets CLI args) launcher_command = self._generate_sglang_command( nnodes=nnodes, @@ -988,38 +1141,46 @@ def _prepare_template_context( f"SGLang Disaggregated requires minimum 3 nodes " f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" ) - + # Always create headless service for disaggregated architecture create_headless_service = True - self.console.print(f"[dim]SGLang Disaggregated: Creating headless service for {nnodes} pods[/dim]") - self.console.print(f"[dim] Architecture: 1 proxy + {max(1, (nnodes-1)*2//5)} prefill + {nnodes-1-max(1, (nnodes-1)*2//5)} decode[/dim]") - + self.console.print( + f"[dim]SGLang Disaggregated: Creating headless service for {nnodes} pods[/dim]" + ) + self.console.print( + f"[dim] Architecture: 1 proxy + {max(1, (nnodes-1)*2//5)} prefill + {nnodes-1-max(1, (nnodes-1)*2//5)} decode[/dim]" + ) + # Generate SGLang Disaggregated launcher command launcher_command = self._generate_sglang_disagg_command( nnodes=nnodes, nproc_per_node=nproc_per_node, master_port=master_port, - model_script=model_info.get("scripts", "run.sh") + model_script=model_info.get("scripts", "run.sh"), ) elif launcher_type == "megatron": if nnodes > 1: create_headless_service = True - self.console.print(f"[dim]Multi-node Megatron-LM: Creating headless service for pod discovery[/dim]") - + self.console.print( + f"[dim]Multi-node Megatron-LM: Creating headless service for pod discovery[/dim]" + ) + # Generate Megatron-LM launcher command launcher_command = self._generate_megatron_command( nnodes=nnodes, 
nproc_per_node=nproc_per_node, master_port=master_port, - model_script=model_info.get("scripts", "run.sh") + model_script=model_info.get("scripts", "run.sh"), ) elif launcher_type == "primus": if nnodes > 1: create_headless_service = True - self.console.print(f"[dim]Multi-node Primus: Creating headless service for pod discovery[/dim]") - + self.console.print( + f"[dim]Multi-node Primus: Creating headless service for pod discovery[/dim]" + ) + # Generate Primus launcher command (env-only: PRIMUS_CONFIG_PATH, PRIMUS_CLI_EXTRA) launcher_command = self._generate_primus_command( nnodes=nnodes, @@ -1029,7 +1190,9 @@ def _prepare_template_context( model_args=model_info.get("args", "") or "", model_name=model_info.get("name", "") or "", ) - primus_cfg = merged_primus_config(self.manifest, self.config.additional_context) + primus_cfg = merged_primus_config( + self.manifest, self.config.additional_context + ) backend_hint = (primus_cfg.get("backend") or "").strip().lower() inferred_backend = infer_primus_backend_from_model_name( model_info.get("name", "") or "" @@ -1050,23 +1213,25 @@ def _prepare_template_context( # Prepare pre/post scripts (similar to local execution) pre_scripts = [] post_scripts = [] - + # Get pre/post scripts from manifest context if available if "context" in self.manifest: if "pre_scripts" in self.manifest["context"]: pre_scripts.extend(self.manifest["context"]["pre_scripts"]) if "post_scripts" in self.manifest["context"]: post_scripts.extend(self.manifest["context"]["post_scripts"]) - + # Add system environment collection (rocEnvTool) - same as local execution # This is controlled by generate_sys_env_details flag (default: True) - generate_sys_env_details = self.config.additional_context.get("generate_sys_env_details", True) + generate_sys_env_details = self.config.additional_context.get( + "generate_sys_env_details", True + ) if generate_sys_env_details: self.gather_system_env_details(pre_scripts, model_info["name"]) - + # Add tool pre/post scripts 
to the execution lists (like local execution) self._add_tool_scripts(pre_scripts, post_scripts) - + # Load pre/post script contents for ConfigMap (since madengine not installed in container) pre_post_script_contents = self._load_common_scripts(pre_scripts + post_scripts) @@ -1122,9 +1287,7 @@ def _prepare_template_context( # Job metadata "job_name": self.job_name, "job_label": self.job_label, - "main_container_name": getattr( - self, "main_container_name", None - ) + "main_container_name": getattr(self, "main_container_name", None) or sanitize_k8s_container_name(self.job_name), "namespace": self.namespace, "model_name": model_name, @@ -1169,7 +1332,9 @@ def _prepare_template_context( "host_ipc": nnodes > 1, # Enable for multi-node "subdomain": subdomain_val, # Execution - "gpu_visibility": ",".join(str(i) for i in range(gpu_count)), # e.g., "0" for 1 GPU, "0,1" for 2 GPUs + "gpu_visibility": ",".join( + str(i) for i in range(gpu_count) + ), # e.g., "0" for 1 GPU, "0,1" for 2 GPUs "gpu_architecture": self.manifest.get("context", {}).get( "gpu_architecture", "gfx90a" ), @@ -1184,7 +1349,7 @@ def _prepare_template_context( "env_vars": self._prepare_env_vars(model_info), # Volumes "results_pvc": f"{self.job_name}-results", # Always create a PVC for results - "pvc_name": f"{self.job_name}-results", # PVC name for template + "pvc_name": f"{self.job_name}-results", # PVC name for template "data_pvc": self.k8s_config.get("data_pvc"), # Multi-node "create_headless_service": create_headless_service, @@ -1195,9 +1360,13 @@ def _prepare_template_context( # Tools configuration - from manifest.context or additional_context "tools_config": self._get_tools_config(), # Tool command chains (pre-built for template) - "launcher_tool_chain": self._build_tool_command_chain( - self._get_tools_config(), "bash /tmp/run_launcher.sh" - ) if launcher_command else None, + "launcher_tool_chain": ( + self._build_tool_command_chain( + self._get_tools_config(), "bash /tmp/run_launcher.sh" + ) + if 
launcher_command + else None + ), "direct_script_tool_chain": self._build_tool_command_chain( self._get_tools_config(), f"bash {model_info.get('scripts', 'run.sh')}" ), @@ -1218,98 +1387,101 @@ def _prepare_template_context( ) return context - + def _get_tools_config(self) -> List[Dict]: """ Get tools configuration from manifest.context or additional_context. - + Prioritizes runtime additional_context, falls back to manifest.context. - + For multi-node runs: - Checks rocprofv3 availability (required for MPI profiling) - Upgrades "rocprof" to "rocprofv3" for multi-node compatibility - Logs warnings if rocprofv3 not available - + Returns: List of tool configurations (enriched with cmd from tools.json) """ # Cache the result to avoid repeated expensive checks and duplicate warnings - if hasattr(self, '_cached_tools_config'): + if hasattr(self, "_cached_tools_config"): return self._cached_tools_config - + # Check runtime additional_context first (allows runtime override) tools = self.config.additional_context.get("tools", []) - + # Fall back to manifest.context if no runtime tools if not tools and "context" in self.manifest: tools = self.manifest["context"].get("tools", []) - + # Apply multi-node profiling logic if applicable distributed_config = self.config.additional_context.get("distributed", {}) nnodes = distributed_config.get("nnodes", 1) - + if nnodes > 1 and tools: # Configure multi-node profiling (handles rocprofv3 detection and tool upgrades) # Create a simple logger wrapper for configure_multi_node_profiling class ConsoleLogger: def __init__(self, console): self.console = console + def info(self, msg): self.console.print(f"[cyan]{msg}[/cyan]") + def warning(self, msg): self.console.print(f"[yellow]{msg}[/yellow]") + def debug(self, msg): pass # Skip debug messages in console - + profiling_config = configure_multi_node_profiling( - nnodes=nnodes, - tools_config=tools, - logger=ConsoleLogger(self.console) + nnodes=nnodes, tools_config=tools, 
logger=ConsoleLogger(self.console) ) - + if profiling_config["enabled"]: tools = profiling_config["tools"] else: # rocprofv3 not available - skip profiling for multi-node tools = [] - + # Enrich tools with cmd from tools.json for K8s template usage result = self._enrich_tools_with_cmd(tools) - + # Cache the result for subsequent calls self._cached_tools_config = result return result - - def _build_tool_command_chain(self, tools_config: List[Dict], base_command: str) -> str: + + def _build_tool_command_chain( + self, tools_config: List[Dict], base_command: str + ) -> str: """ Build a command chain from multiple tools, wrapping the base command. - + Tools are chained from outermost to innermost: tool_n wraps tool_2 wraps tool_1 wraps base_command - + Each tool's OUTPUT_FILE env var is set inline to avoid conflicts. - + Args: tools_config: List of enriched tool configurations base_command: The base command to wrap (e.g., "bash /tmp/run_launcher.sh") - + Returns: Complete command chain string """ if not tools_config: return base_command - + # Filter tools that have a cmd field tools_with_cmd = [t for t in tools_config if t.get("cmd")] - + if not tools_with_cmd: return base_command - + # Build command chain from inside out (reverse order) cmd_chain = base_command for tool in reversed(tools_with_cmd): tool_cmd = tool["cmd"].replace("../scripts/common/", "scripts/common/") - + # Set OUTPUT_FILE inline for this specific tool (if defined in tool's env_vars) tool_env_vars = tool.get("env_vars", {}) if "OUTPUT_FILE" in tool_env_vars: @@ -1318,196 +1490,224 @@ def _build_tool_command_chain(self, tools_config: List[Dict], base_command: str) cmd_chain = f"OUTPUT_FILE={output_file} {tool_cmd} {cmd_chain}" else: cmd_chain = f"{tool_cmd} {cmd_chain}" - + return cmd_chain - + def _enrich_tools_with_cmd(self, tools: List[Dict]) -> List[Dict]: """ Enrich tools configuration with cmd field from tools.json. 
- + This is needed for K8s template to generate the correct encapsulation command. - + Args: tools: List of tool configurations (may only have 'name' field) - + Returns: Enriched list with 'cmd' field added from tools.json """ if not tools: return tools - + # Load tools.json - tools_json_path = Path(__file__).parent.parent / "scripts" / "common" / "tools.json" + tools_json_path = ( + Path(__file__).parent.parent / "scripts" / "common" / "tools.json" + ) if not tools_json_path.exists(): - self.console.print(f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]") + self.console.print( + f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]" + ) return tools - + with open(tools_json_path, "r") as f: tools_definitions = json.load(f) - + enriched_tools = [] for tool in tools: tool_name = tool.get("name") if not tool_name: enriched_tools.append(tool) continue - + # Get tool definition from tools.json if tool_name not in tools_definitions.get("tools", {}): - self.console.print(f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]") + self.console.print( + f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]" + ) enriched_tools.append(tool) continue - + tool_def = tools_definitions["tools"][tool_name] - + # Create enriched tool config with cmd enriched_tool = tool.copy() if "cmd" not in enriched_tool and "cmd" in tool_def: enriched_tool["cmd"] = tool_def["cmd"] - + # Also copy env_vars if present if "env_vars" not in enriched_tool and "env_vars" in tool_def: enriched_tool["env_vars"] = tool_def["env_vars"] - + enriched_tools.append(enriched_tool) - + return enriched_tools def _load_k8s_tools(self) -> Dict: """ Load K8s-specific tools configuration. 
- + Returns: Dict with K8s tools configuration """ k8s_tools_file = Path(__file__).parent.parent / "scripts" / "k8s" / "tools.json" - + if k8s_tools_file.exists(): try: with open(k8s_tools_file, "r") as f: return json.load(f) except Exception as e: - self.console.print(f"[yellow]Warning: Failed to load K8s tools config: {e}[/yellow]") + self.console.print( + f"[yellow]Warning: Failed to load K8s tools config: {e}[/yellow]" + ) return {} else: - self.console.print(f"[yellow]Warning: K8s tools.json not found at {k8s_tools_file}[/yellow]") + self.console.print( + f"[yellow]Warning: K8s tools.json not found at {k8s_tools_file}[/yellow]" + ) return {} - + def _prepare_env_vars(self, model_info: Dict) -> Dict[str, str]: """ Prepare environment variables from multiple sources. - + Merges env vars from: 1. Base additional_context 2. Data provider 3. Tools configuration - + Args: model_info: Model configuration - + Returns: Merged environment variables dict """ env_vars = {} - + # 1. Base environment variables from additional_context base_env = self.config.additional_context.get("env_vars", {}) env_vars.update(base_env) - + # 1b. Critical ROCm environment variable (if not already set) # HSA_NO_SCRATCH_RECLAIM=1 required for AMD MI300X and newer GPUs # Prevents performance degradation and NCCL errors if "HSA_NO_SCRATCH_RECLAIM" not in env_vars: env_vars["HSA_NO_SCRATCH_RECLAIM"] = "1" - + # 2. 
Data provider environment variables data_config = self._prepare_data_config(model_info) if data_config: if "env_vars" in data_config: # Exclude MAD_DATAHOME from data provider's env vars (we set it explicitly below for K8s) - data_provider_env = {k: v for k, v in data_config["env_vars"].items() if k != "MAD_DATAHOME"} + data_provider_env = { + k: v + for k, v in data_config["env_vars"].items() + if k != "MAD_DATAHOME" + } env_vars.update(data_provider_env) # Always set MAD_DATAHOME for K8s (PVC mount point /data, not /data_dlm_0) if "datahome" in data_config: env_vars["MAD_DATAHOME"] = data_config["datahome"] - + # 3. Tools configuration environment variables # Check both additional_context and manifest.context for tools tools_config = self.config.additional_context.get("tools", []) if not tools_config and "context" in self.manifest: tools_config = self.manifest["context"].get("tools", []) - + for tool in tools_config: if "env_vars" in tool: # Skip OUTPUT_FILE as it's set inline in command chain to avoid conflicts - tool_env_vars = {k: v for k, v in tool["env_vars"].items() if k != "OUTPUT_FILE"} + tool_env_vars = { + k: v for k, v in tool["env_vars"].items() if k != "OUTPUT_FILE" + } env_vars.update(tool_env_vars) - + return env_vars - + def _prepare_data_config(self, model_info: Dict) -> Optional[Dict]: """ Prepare data provider configuration for K8s pod. 
- + Args: model_info: Model configuration - + Returns: Data configuration dict or None """ if "data" not in model_info or not model_info["data"]: return None - + # Initialize data provider if needed if not self.data: try: # Create minimal context for data provider # We only need the data.json file to be present import os + data_json_file = "data.json" if os.path.exists(data_json_file): # Import Context and create minimal instance # Data provider needs this to function - self.context_for_data = type('obj', (object,), { - 'ctx': {}, - 'sh': lambda cmd: os.popen(cmd).read().strip() - })() + self.context_for_data = type( + "obj", + (object,), + {"ctx": {}, "sh": lambda cmd: os.popen(cmd).read().strip()}, + )() self.data = Data( self.context_for_data, filename=data_json_file, - force_mirrorlocal=False + force_mirrorlocal=False, ) else: - self.console.print("[yellow]Warning: data.json not found, data provider unavailable[/yellow]") + self.console.print( + "[yellow]Warning: data.json not found, data provider unavailable[/yellow]" + ) return None except Exception as e: - self.console.print(f"[yellow]Warning: Could not initialize data provider: {e}[/yellow]") + self.console.print( + f"[yellow]Warning: Could not initialize data provider: {e}[/yellow]" + ) return None - + try: # Get data environment variables data_env = self.data.get_env(model_info["data"]) - + # Find data provider for this data dp = self.data.find_dataprovider(model_info["data"]) if not dp: - self.console.print(f"[yellow]Warning: Data provider not found for {model_info['data']}[/yellow]") + self.console.print( + f"[yellow]Warning: Data provider not found for {model_info['data']}[/yellow]" + ) return None - + # Get provider type and source path - provider_type = dp.provider_type if hasattr(dp, 'provider_type') else "local" - source_url = dp.config.get("path", "") if hasattr(dp, 'config') else "" - + provider_type = ( + dp.provider_type if hasattr(dp, "provider_type") else "local" + ) + source_url = 
dp.config.get("path", "") if hasattr(dp, "config") else "" + # K8s best practice: Always use /data (PVC mount point) # PVC provides persistent, shared storage across all pods/nodes # Separation of storage (PVC) from compute (pods) is K8s standard # FORCE datahome to /data for K8s (override data provider's default /data_dlm_0) - + # Filter out MAD_DATAHOME from data provider env vars (will be set explicitly below) - filtered_data_env = {k: v for k, v in (data_env or {}).items() if k != "MAD_DATAHOME"} + filtered_data_env = { + k: v for k, v in (data_env or {}).items() if k != "MAD_DATAHOME" + } # Add MAD_DATAHOME with correct K8s value filtered_data_env["MAD_DATAHOME"] = "/data" - + return { "data_name": model_info["data"], "env_vars": filtered_data_env, @@ -1516,7 +1716,9 @@ def _prepare_data_config(self, model_info: Dict) -> Optional[Dict]: "datahome": "/data", # Always use PVC mount point for K8s } except Exception as e: - self.console.print(f"[yellow]Warning: Could not prepare data config: {e}[/yellow]") + self.console.print( + f"[yellow]Warning: Could not prepare data config: {e}[/yellow]" + ) return None def _save_debug_manifests(self): @@ -1534,9 +1736,7 @@ def _save_debug_manifests(self): if self.service_yaml: (output_dir / "service.yaml").write_text(self.service_yaml) - self.console.print( - f"[yellow]Debug: Manifests saved to {output_dir}[/yellow]" - ) + self.console.print(f"[yellow]Debug: Manifests saved to {output_dir}[/yellow]") def _k8s_data_storage_class(self) -> Optional[str]: """StorageClass for long-lived ``madengine-shared-data`` (NFS RWX recommended).""" @@ -1598,7 +1798,7 @@ def _create_results_pvc(self, nnodes: int = 1) -> str: storage_size=self.k8s_config.get("results_storage_size", "10Gi"), storage_class=storage_class, ) - + # Create PVC (retry on 409 "object is being deleted" until it is gone) pvc_dict = yaml.safe_load(pvc_yaml) max_create_retries = 6 @@ -1610,7 +1810,11 @@ def _create_results_pvc(self, nnodes: int = 1) -> str: ) return 
pvc_name except ApiException as e: - if e.status == 409 and e.body and "object is being deleted" in (e.body or ""): + if ( + e.status == 409 + and e.body + and "object is being deleted" in (e.body or "") + ): if attempt < max_create_retries - 1: self.console.print( f"[dim]PVC still terminating, waiting {create_wait_seconds}s before retry ({attempt + 1}/{max_create_retries})[/dim]" @@ -1620,7 +1824,7 @@ def _create_results_pvc(self, nnodes: int = 1) -> str: raise else: raise - + def _wait_for_pvc_deleted(self, pvc_name: str, max_wait: int = 90) -> None: """Block until the PVC is fully removed (or timeout).""" for i in range(max_wait): @@ -1744,7 +1948,7 @@ def _create_or_get_data_pvc(self, nnodes: int = 1) -> str: ) return pvc_name - + def _cleanup_existing_resources(self): """Delete existing Job, ConfigMap, and Service if they exist.""" # Delete existing Job @@ -1752,7 +1956,7 @@ def _cleanup_existing_resources(self): self.batch_v1.delete_namespaced_job( name=self.job_name, namespace=self.namespace, - propagation_policy="Background" + propagation_policy="Background", ) self.console.print(f"[dim]Deleted existing Job: {self.job_name}[/dim]") except ApiException as e: @@ -1766,62 +1970,64 @@ def _cleanup_existing_resources(self): ) except ApiException: pass - + # Delete existing ConfigMap try: self.core_v1.delete_namespaced_config_map( - name=self.configmap_name, - namespace=self.namespace + name=self.configmap_name, namespace=self.namespace + ) + self.console.print( + f"[dim]Deleted existing ConfigMap: {self.configmap_name}[/dim]" ) - self.console.print(f"[dim]Deleted existing ConfigMap: {self.configmap_name}[/dim]") except ApiException as e: if e.status != 404: pass - + # Delete existing Service - if hasattr(self, 'service_yaml') and self.service_yaml: + if hasattr(self, "service_yaml") and self.service_yaml: try: self.core_v1.delete_namespaced_service( - name=self.service_name, - namespace=self.namespace + name=self.service_name, namespace=self.namespace + ) + 
self.console.print( + f"[dim]Deleted existing Service: {self.service_name}[/dim]" ) - self.console.print(f"[dim]Deleted existing Service: {self.service_name}[/dim]") except ApiException as e: if e.status != 404: pass - + # Delete existing collector pod (must be done before PVC to allow PVC deletion) collector_pod_name = f"collector-{self.job_name}" try: self.core_v1.delete_namespaced_pod( name=collector_pod_name, namespace=self.namespace, - grace_period_seconds=0 + grace_period_seconds=0, + ) + self.console.print( + f"[dim]Deleted existing collector pod: {collector_pod_name}[/dim]" ) - self.console.print(f"[dim]Deleted existing collector pod: {collector_pod_name}[/dim]") # Wait a moment for pod to release the PVC time.sleep(2) except ApiException as e: if e.status != 404: pass - + # Delete existing PVC pvc_name = f"{self.job_name}-results" try: self.core_v1.delete_namespaced_persistent_volume_claim( - name=pvc_name, - namespace=self.namespace + name=pvc_name, namespace=self.namespace ) self.console.print(f"[dim]Deleted existing PVC: {pvc_name}[/dim]") - + # Wait for PVC to be fully deleted (not just marked for deletion) max_wait = 90 # Maximum 90 seconds (PV can take time to detach) wait_interval = 1 # Check every 1 second for i in range(max_wait): try: self.core_v1.read_namespaced_persistent_volume_claim( - name=pvc_name, - namespace=self.namespace + name=pvc_name, namespace=self.namespace ) if i > 0 and i % 10 == 0: self.console.print( @@ -1835,7 +2041,7 @@ def _cleanup_existing_resources(self): except ApiException as e: if e.status != 404: pass - + # Wait a moment for other resources to be deleted time.sleep(1) @@ -1844,22 +2050,22 @@ def deploy(self) -> DeploymentResult: try: # Clean up any existing resources first self._cleanup_existing_resources() - + # 1. 
Create PVC for results storage self.console.print("[blue]Creating PVC for results storage...[/blue]") nnodes_deploy = getattr(self, "_nnodes", 1) pvc_name = self._create_results_pvc(nnodes=nnodes_deploy) self.console.print(f"[green]✓ Created PVC: {pvc_name}[/green]") - + # 1b. Create or reuse data PVC if data provider is configured and auto-creation was flagged - if hasattr(self, '_data_config') and self._data_config: + if hasattr(self, "_data_config") and self._data_config: # Check if we set the PVC name during prepare (auto-creation case) data_pvc_name = self.k8s_config.get("data_pvc") if data_pvc_name == "madengine-shared-data": # Auto-creation mode: create/reuse the PVC - nnodes = getattr(self, '_nnodes', 1) + nnodes = getattr(self, "_nnodes", 1) self._create_or_get_data_pvc(nnodes=nnodes) - + # 2. Create Secrets from local credential.json (strategy: from_local_credentials) merged_sec = merge_secrets_config(self.k8s_config) strategy = merged_sec.get("strategy", SECRETS_STRATEGY_FROM_LOCAL) @@ -1890,7 +2096,9 @@ def deploy(self) -> DeploymentResult: self.core_v1.create_namespaced_service( namespace=self.namespace, body=service_dict ) - self.console.print(f"[green]✓ Created Service: {self.service_name}[/green]") + self.console.print( + f"[green]✓ Created Service: {self.service_name}[/green]" + ) # 5. Create Job self.console.print("[blue]Creating Job...[/blue]") @@ -1928,18 +2136,18 @@ def deploy(self) -> DeploymentResult: def monitor(self, deployment_id: str) -> DeploymentResult: """ Monitor Job status using Python API. - + If live_output is enabled, streams pod logs in real-time. Otherwise, polls status periodically. 
""" # Check if live output is requested live_output = self.config.additional_context.get("live_output", False) - + if live_output: return self._monitor_with_live_logs(deployment_id) else: return self._monitor_status_only(deployment_id) - + def _monitor_status_only(self, deployment_id: str) -> DeploymentResult: """Monitor Job status without streaming logs.""" try: @@ -1985,21 +2193,23 @@ def _monitor_status_only(self, deployment_id: str) -> DeploymentResult: message=f"Job {deployment_id} not found", ) raise - + def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: """Monitor Job and stream logs in real-time.""" - self.console.print(f"\n[cyan]═══ Streaming pod logs (--live-output) ═══[/cyan]\n") - + self.console.print( + f"\n[cyan]═══ Streaming pod logs (--live-output) ═══[/cyan]\n" + ) + pod_name = None log_position = 0 - + while True: try: # Check job status job = self.batch_v1.read_namespaced_job_status( name=deployment_id, namespace=self.namespace ) - + # Get pod if we don't have it yet if not pod_name: pods = self.core_v1.list_namespaced_pod( @@ -2008,8 +2218,10 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: ) if pods.items: pod_name = pods.items[0].metadata.name - self.console.print(f"[dim]Following logs from pod: {pod_name}[/dim]\n") - + self.console.print( + f"[dim]Following logs from pod: {pod_name}[/dim]\n" + ) + # Stream logs if we have a pod if pod_name: try: @@ -2017,31 +2229,33 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: logs = self.core_v1.read_namespaced_pod_log( name=pod_name, namespace=self.namespace, - tail_lines=100 if log_position == 0 else None + tail_lines=100 if log_position == 0 else None, ) - + # Print new log lines and trigger artifact collection if logs: - log_lines = logs.split('\n') + log_lines = logs.split("\n") if len(log_lines) > log_position: for line in log_lines[log_position:]: if line.strip(): print(line) log_position = len(log_lines) - + except 
ApiException as e: if e.status != 400: # Ignore "container not ready" errors pass - + # Check if job completed if job.status.succeeded: - self.console.print(f"\n[green]✓ Job {deployment_id} completed successfully[/green]\n") + self.console.print( + f"\n[green]✓ Job {deployment_id} completed successfully[/green]\n" + ) return DeploymentResult( status=DeploymentStatus.SUCCESS, deployment_id=deployment_id, message=f"Job {deployment_id} completed successfully", ) - + if job.status.failed: self.console.print(f"\n[red]✗ Job {deployment_id} failed[/red]\n") # Print final logs @@ -2052,9 +2266,9 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: deployment_id=deployment_id, message=f"Job {deployment_id} failed", ) - + time.sleep(2) # Poll every 2 seconds - + except ApiException as e: if e.status == 404: return DeploymentResult( @@ -2063,24 +2277,22 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: message=f"Job {deployment_id} not found", ) raise - + def _print_pod_logs_on_failure(self, deployment_id: str): """Print pod logs when job fails (for debugging).""" try: self.console.print(f"\n[yellow]═══ Pod logs (last 50 lines) ═══[/yellow]\n") - + pods = self.core_v1.list_namespaced_pod( namespace=self.namespace, label_selector=_pod_job_name_label_selector(deployment_id), ) - + for pod in pods.items: pod_name = pod.metadata.name try: logs = self.core_v1.read_namespaced_pod_log( - name=pod_name, - namespace=self.namespace, - tail_lines=50 + name=pod_name, namespace=self.namespace, tail_lines=50 ) self.console.print(f"[dim]Pod: {pod_name}[/dim]") print(logs) @@ -2135,17 +2347,17 @@ def _refresh_pod_until_terminal_phase( def collect_results(self, deployment_id: str) -> Dict[str, Any]: """ Enhanced results collection from K8s pods following vLLM multi-node best practices. 
- + For Data Parallel deployments (vLLM, SGLang): - Each pod runs an independent replica - Only pod-0 reports metrics to avoid duplicates - Total throughput = pod-0 throughput × num_replicas - + Collects: 1. Pod logs (``k8s_results///pod.log``) 2. PVC mirror per pod (``...//pvc/``), mapped from ``/results//`` 3. File artifacts via kubectl cp when pods are still running (keep-alive path) - + Returns: Dict with logs, artifacts, and performance results """ @@ -2161,8 +2373,10 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: # Create results directory for this deployment results_dir = Path(f"./k8s_results/{deployment_id}") results_dir.mkdir(parents=True, exist_ok=True) - - self.console.print(f"[cyan]📦 Collecting results from K8s job: {deployment_id}[/cyan]") + + self.console.print( + f"[cyan]📦 Collecting results from K8s job: {deployment_id}[/cyan]" + ) try: # Get pods for this job @@ -2178,7 +2392,7 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: model_info = self.manifest["built_models"][model_key] else: model_info = {} - + # Get build info from built_images image_keys = list(self.manifest.get("built_images", {}).keys()) if image_keys: @@ -2193,21 +2407,21 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: is_distributed = distributed_config.get("enabled", False) nnodes = distributed_config.get("nnodes", 1) is_multinode = is_distributed and nnodes > 1 - + # Determine launcher_type the same way as _prepare_template_context does # (deployment_config doesn't store launcher_type directly) launcher_config = self.config.additional_context.get("launcher", {}) launcher_type = ( - launcher_config.get("type") - if launcher_config.get("type") is not None + launcher_config.get("type") + if launcher_config.get("type") is not None else distributed_config.get("launcher") ) - + # Normalize launcher based on deployment type and validity launcher_type = normalize_launcher(launcher_type, "kubernetes") - + is_ray_launcher = 
launcher_type in ["vllm", "sglang"] - + # Sort pods by name to ensure consistent ordering (pod-0 is master) sorted_pods = sorted(pods.items, key=lambda p: p.metadata.name) @@ -2217,31 +2431,33 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: # Parse performance from ALL nodes (each reports node-local metrics) # Aggregate metrics based on type (sum for throughput, etc.) # ======================================================================== - + per_node_metrics = [] # Store performance from each node results["nodes"] = [] # Store per-node details for display - + # Special handling for Ray-based launchers (vLLM, SGLang) # These report per-replica metrics, need scaling if is_multinode and is_ray_launcher: self.console.print( f"[cyan]Multi-node Ray deployment: {nnodes} nodes (Data Parallel mode)[/cyan]" ) - + # Collect from ALL pods for pod_index, pod in enumerate(sorted_pods): pod_name = pod.metadata.name pod_dir = results_dir / pod_name pod_dir.mkdir(exist_ok=True) - + # Extract node rank from pod name (e.g., madengine-dummy-torchrun-0 -> 0) try: - node_rank = int(pod_name.rsplit('-', 1)[-1]) + node_rank = int(pod_name.rsplit("-", 1)[-1]) except (ValueError, IndexError): node_rank = pod_index - - self.console.print(f"[dim] Collecting from pod: {pod_name} (node-{node_rank})[/dim]") - + + self.console.print( + f"[dim] Collecting from pod: {pod_name} (node-{node_rank})[/dim]" + ) + try: # 1. Collect pod logs log = self.core_v1.read_namespaced_pod_log( @@ -2249,37 +2465,41 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: ) log_file = pod_dir / "pod.log" log_file.write_text(log) - results["logs"].append({ - "pod": pod_name, - "log": log, - "file": str(log_file) - }) - + results["logs"].append( + {"pod": pod_name, "log": log, "file": str(log_file)} + ) + # 2. 
Parse NODE-LOCAL performance from log perf_data = self._parse_performance_from_log( log, model_info.get("name", "") ) - + # Pod phase/exit can lag right after Job success; poll until terminal or timeout pod = self._refresh_pod_until_terminal_phase(pod_name) pod_status = pod.status.phase if pod else "Unknown" pod_exit_code = ( self._primary_workload_container_exit_code(pod) if pod else -1 ) - + # Store per-node info for display table node_info = { "node_id": node_rank, "pod_name": pod_name, - "status": "SUCCESS" if pod_status == "Succeeded" and pod_exit_code == 0 else "FAILED", + "status": ( + "SUCCESS" + if pod_status == "Succeeded" and pod_exit_code == 0 + else "FAILED" + ), "exit_code": pod_exit_code, - "performance": perf_data.get("performance") if perf_data else None, + "performance": ( + perf_data.get("performance") if perf_data else None + ), "metric": perf_data.get("metric") if perf_data else None, "duration": perf_data.get("duration") if perf_data else None, - "log_file": str(log_file) + "log_file": str(log_file), } results["nodes"].append(node_info) - + if perf_data: # For Ray launchers, this is per-replica metric if is_multinode and is_ray_launcher: @@ -2293,42 +2513,48 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: self.console.print( f"[dim] No performance metric found in node-{node_rank} log[/dim]" ) - + except ApiException as e: self.console.print( f"[red]✗ Failed to get logs for pod {pod_name}: {e.reason}[/red]" ) - results["nodes"].append({ - "node_id": node_rank, - "pod_name": pod_name, - "status": "FAILED", - "exit_code": -1, - "performance": None, - "metric": None, - "error": f"Failed to get logs: {e.reason}" - }) + results["nodes"].append( + { + "node_id": node_rank, + "pod_name": pod_name, + "status": "FAILED", + "exit_code": -1, + "performance": None, + "metric": None, + "error": f"Failed to get logs: {e.reason}", + } + ) except Exception as e: self.console.print( f"[red]✗ Error collecting from pod {pod_name}: {e}[/red]" ) 
- results["nodes"].append({ - "node_id": node_rank, - "pod_name": pod_name, - "status": "FAILED", - "exit_code": -1, - "performance": None, - "metric": None, - "error": str(e) - }) - + results["nodes"].append( + { + "node_id": node_rank, + "pod_name": pod_name, + "status": "FAILED", + "exit_code": -1, + "performance": None, + "metric": None, + "error": str(e), + } + ) + self.console.print( f"[green]✓ Collected logs from {len(results['logs'])} pods[/green]" ) - + # Collect artifacts from PVC before deciding success/failure (needed for multiple_results fallback) k8s_pod_names = [p.metadata.name for p in sorted_pods] - self._collect_from_pvc(deployment_id, results_dir, results, pod_names=k8s_pod_names) - + self._collect_from_pvc( + deployment_id, results_dir, results, pod_names=k8s_pod_names + ) + # ======================================================================== # Aggregate per-node metrics # ======================================================================== @@ -2343,7 +2569,7 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: self.console.print( f"[green] Total capacity: {aggregated_perf:.1f} req/s ({nnodes} nodes)[/green]" ) - + # Create aggregated record manually for Ray aggregated_record = { "model": per_node_metrics[0]["model"], @@ -2354,21 +2580,23 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: "nnodes": nnodes, "launcher": launcher_type or "N/A", "deployment_type": "kubernetes", - "gpu_architecture": per_node_metrics[0].get("gpu_architecture", "N/A"), + "gpu_architecture": per_node_metrics[0].get( + "gpu_architecture", "N/A" + ), "duration": per_node_metrics[0].get("duration", "N/A"), "data_name": per_node_metrics[0].get("data_name", "N/A"), - "data_provider": per_node_metrics[0].get("data_provider", "N/A"), + "data_provider": per_node_metrics[0].get( + "data_provider", "N/A" + ), "aggregation_method": "scaled_by_nnodes", - "nodes_contributing": nnodes + "nodes_contributing": nnodes, } else: # Use new 
aggregation logic for other launchers aggregated_record = self._aggregate_node_metrics( - per_node_metrics, - nnodes, - launcher_type + per_node_metrics, nnodes, launcher_type ) - + if aggregated_record: # Full reporting pipeline: perf_entry at project root, then update_* (same as local/SLURM) self._ensure_perf_csv_exists() @@ -2379,9 +2607,13 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: with open(perf_entry_path, "w", encoding="utf-8") as f: json.dump(run_details_dict, f, indent=2) if run_details_dict.get("status") == "SUCCESS": - update_perf_csv(perf_csv="perf.csv", single_result=str(perf_entry_path)) + update_perf_csv( + perf_csv="perf.csv", single_result=str(perf_entry_path) + ) else: - update_perf_csv(perf_csv="perf.csv", exception_result=str(perf_entry_path)) + update_perf_csv( + perf_csv="perf.csv", exception_result=str(perf_entry_path) + ) scripts_path = model_info.get("scripts", "") scripts_base_dir = scripts_base_dir_from(scripts_path) try: @@ -2403,13 +2635,17 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: num_entries=num_entries, ) except Exception as e: - self.console.print(f"[yellow]⚠ Could not update perf_super: {e}[/yellow]") - results["successful_runs"].append({ - "model": model_info.get("name"), - "perf_data": aggregated_record, - "nodes": results["nodes"], - "per_node_metrics": per_node_metrics - }) + self.console.print( + f"[yellow]⚠ Could not update perf_super: {e}[/yellow]" + ) + results["successful_runs"].append( + { + "model": model_info.get("name"), + "perf_data": aggregated_record, + "nodes": results["nodes"], + "per_node_metrics": per_node_metrics, + } + ) self.console.print( f"[green]✓ Aggregated performance from {len(per_node_metrics)} nodes[/green]" ) @@ -2427,6 +2663,7 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: gpu_arch = "N/A" if results.get("logs"): import re + log_content = results["logs"][0].get("log", "") m = re.search(r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)", 
log_content) if m: @@ -2460,15 +2697,20 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: ) # Build successful_runs for display (one entry per CSV row) import csv as _csv + model_name = model_info.get("name", "") - with open(resolved_csv_path, "r", encoding="utf-8", errors="ignore") as f: + with open( + resolved_csv_path, "r", encoding="utf-8", errors="ignore" + ) as f: reader = _csv.DictReader(f) for row in reader: row = {k.strip(): v for k, v in row.items() if k} if row.get("performance") and row.get("metric"): display_model = f"{model_name}_{row.get('model', '')}" record = self._create_multiple_result_row_record( - model_info, build_info, deployment_id, + model_info, + build_info, + deployment_id, { "model": display_model, "performance": row.get("performance"), @@ -2478,12 +2720,22 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: }, ) if record: - results["successful_runs"].append({ - "model": display_model, - "perf_data": record, - "nodes": [], - "per_node_metrics": [{"model": display_model, "performance": row.get("performance"), "metric": row.get("metric", "")}], - }) + results["successful_runs"].append( + { + "model": display_model, + "perf_data": record, + "nodes": [], + "per_node_metrics": [ + { + "model": display_model, + "performance": row.get( + "performance" + ), + "metric": row.get("metric", ""), + } + ], + } + ) self.console.print( f"[green]✓ Updated perf.csv, perf_entry.*, perf_super.* (Docker-compatible)[/green]" ) @@ -2499,12 +2751,14 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: ) if record: self._write_to_perf_csv(record) - results["successful_runs"].append({ - "model": item["model"], - "perf_data": record, - "nodes": [], - "per_node_metrics": [item], - }) + results["successful_runs"].append( + { + "model": item["model"], + "perf_data": record, + "nodes": [], + "per_node_metrics": [item], + } + ) self.console.print( f"[green]✓ Wrote {len(fallback_metrics)} row(s) from multiple_results to 
perf.csv[/green]" ) @@ -2515,30 +2769,38 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: model_info, build_info, deployment_id, error_msg ) self._write_to_perf_csv(failure_record) - results["failed_runs"].append({ - "model": model_info.get("name", "Unknown"), - "error": error_msg, - "nodes": results["nodes"] - }) + results["failed_runs"].append( + { + "model": model_info.get("name", "Unknown"), + "error": error_msg, + "nodes": results["nodes"], + } + ) self.console.print( f"[yellow]⚠ No performance metrics found, recorded as FAILED[/yellow]" ) - elif resolved_csv_path and not REPORTING_AVAILABLE and not results.get("successful_runs"): + elif ( + resolved_csv_path + and not REPORTING_AVAILABLE + and not results.get("successful_runs") + ): # Legacy path ran but produced no valid rows error_msg = "No performance metrics found from any node" failure_record = self._create_failure_record( model_info, build_info, deployment_id, error_msg ) self._write_to_perf_csv(failure_record) - results["failed_runs"].append({ - "model": model_info.get("name", "Unknown"), - "error": error_msg, - "nodes": results["nodes"] - }) + results["failed_runs"].append( + { + "model": model_info.get("name", "Unknown"), + "error": error_msg, + "nodes": results["nodes"], + } + ) self.console.print( f"[yellow]⚠ No performance metrics found, recorded as FAILED[/yellow]" ) - + # 4. Generate summary self._generate_results_summary(results, results_dir) @@ -2546,7 +2808,7 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: self.console.print(f"[yellow]⚠ Results collection incomplete: {e}[/yellow]") return results - + def _collect_artifacts_immediately(self, deployment_id: str, pod_name: str) -> None: """ Collect artifacts immediately from a running pod during the sleep period. 
@@ -2556,41 +2818,45 @@ def _collect_artifacts_immediately(self, deployment_id: str, pod_name: str) -> N # Create results directory results_dir = Path("k8s_results") / deployment_id results_dir.mkdir(parents=True, exist_ok=True) - + pod_dir = results_dir / pod_name pod_dir.mkdir(exist_ok=True) - + # Collect artifacts artifacts = self._collect_pod_artifacts(pod_name, pod_dir) - + if artifacts: - self.console.print(f"[green]✓ Collected {len(artifacts)} artifacts from {pod_name}[/green]") + self.console.print( + f"[green]✓ Collected {len(artifacts)} artifacts from {pod_name}[/green]" + ) else: - self.console.print(f"[yellow]⚠ No artifacts collected from {pod_name}[/yellow]") - + self.console.print( + f"[yellow]⚠ No artifacts collected from {pod_name}[/yellow]" + ) + except Exception as e: self.console.print(f"[yellow]⚠ Error collecting artifacts: {e}[/yellow]") - + def _collect_pod_artifacts(self, pod_name: str, dest_dir: Path) -> List[Dict]: """ Collect file artifacts from pod using kubectl cp. 
- + Collects: - perf.csv (performance results) - *_env.csv (environment details from rocEnvTool) - profiling outputs (rocprof*, results*, *.db) - tracing outputs (*_output/ directories) - tool-specific outputs - + Args: pod_name: Name of the Kubernetes pod dest_dir: Local directory to save artifacts - + Returns: List of collected artifact metadata """ artifacts = [] - + # Define artifact patterns to collect artifact_patterns = [ {"pattern": "perf.csv", "type": "performance"}, @@ -2598,55 +2864,69 @@ def _collect_pod_artifacts(self, pod_name: str, dest_dir: Path) -> List[Dict]: {"pattern": "results*", "type": "profiling"}, {"pattern": "*.db", "type": "profiling"}, {"pattern": "trace.*", "type": "tracing"}, - {"pattern": "prof.csv", "type": "profiling"}, # Raw profiler output before post-script renames it + { + "pattern": "prof.csv", + "type": "profiling", + }, # Raw profiler output before post-script renames it {"pattern": "gpu_info_*.csv", "type": "profiling"}, {"pattern": "library_trace.csv", "type": "tracing"}, ] - + for artifact_def in artifact_patterns: pattern = artifact_def["pattern"] artifact_type = artifact_def["type"] - + try: # Try direct kubectl cp without exec (works during the sleep period) # For patterns with wildcards, try common specific filenames - if '*' in pattern: + if "*" in pattern: # Expand pattern to specific known files if pattern == "*_env.csv": - specific_files = ["dummy_prof_env.csv", "dummy_data_minio_env.csv"] + specific_files = [ + "dummy_prof_env.csv", + "dummy_data_minio_env.csv", + ] elif pattern == "gpu_info_*.csv": - specific_files = ["gpu_info_power_profiler_output.csv", "gpu_info_vram_profiler_output.csv"] + specific_files = [ + "gpu_info_power_profiler_output.csv", + "gpu_info_vram_profiler_output.csv", + ] elif pattern == "results*": specific_files = ["results.csv", "results.txt", "results.json"] elif pattern == "trace.*": specific_files = ["trace.txt", "trace.csv", "trace.json"] else: specific_files = [] - + for filename in 
specific_files: local_path = dest_dir / filename cp_cmd = [ - "kubectl", "cp", + "kubectl", + "cp", f"{self.namespace}/{pod_name}:/workspace/{filename}", - str(local_path) + str(local_path), ] - + cp_result = subprocess.run( cp_cmd, capture_output=True, text=True, timeout=30 ) - + if cp_result.returncode == 0 and local_path.exists(): - artifacts.append({ - "pod": pod_name, - "type": artifact_type, - "source": f"/workspace/{filename}", - "local_path": str(local_path), - "size": local_path.stat().st_size - }) + artifacts.append( + { + "pod": pod_name, + "type": artifact_type, + "source": f"/workspace/{filename}", + "local_path": str(local_path), + "size": local_path.stat().st_size, + } + ) self.console.print( f"[dim] ✓ Collected {artifact_type}: {filename}[/dim]" ) - elif cp_result.stderr and "No such file" not in cp_result.stderr: + elif ( + cp_result.stderr and "No such file" not in cp_result.stderr + ): # Log unexpected errors (but not "file not found") self.console.print( f"[yellow] ⚠ Failed to collect {filename}: {cp_result.stderr.strip()}[/yellow]" @@ -2655,23 +2935,26 @@ def _collect_pod_artifacts(self, pod_name: str, dest_dir: Path) -> List[Dict]: # Direct file - try to copy it local_path = dest_dir / pattern cp_cmd = [ - "kubectl", "cp", + "kubectl", + "cp", f"{self.namespace}/{pod_name}:/workspace/{pattern}", - str(local_path) + str(local_path), ] - + cp_result = subprocess.run( cp_cmd, capture_output=True, text=True, timeout=30 ) - + if cp_result.returncode == 0 and local_path.exists(): - artifacts.append({ - "pod": pod_name, - "type": artifact_type, - "source": f"/workspace/{pattern}", - "local_path": str(local_path), - "size": local_path.stat().st_size - }) + artifacts.append( + { + "pod": pod_name, + "type": artifact_type, + "source": f"/workspace/{pattern}", + "local_path": str(local_path), + "size": local_path.stat().st_size, + } + ) self.console.print( f"[dim] ✓ Collected {artifact_type}: {pattern}[/dim]" ) @@ -2680,48 +2963,55 @@ def 
_collect_pod_artifacts(self, pod_name: str, dest_dir: Path) -> List[Dict]: self.console.print( f"[yellow] ⚠ Failed to collect {pattern}: {cp_result.stderr.strip()}[/yellow]" ) - + except subprocess.TimeoutExpired: pass # Timeout - skip this file except Exception: pass # File not found or not accessible - this is expected - + # Try to collect known output directories using kubectl cp directly (during sleep period) output_directories = ["rocprof_output", "rpd_output", "trace_output"] for dir_name in output_directories: try: local_dir = dest_dir / dir_name cp_cmd = [ - "kubectl", "cp", + "kubectl", + "cp", f"{self.namespace}/{pod_name}:/workspace/{dir_name}", - str(local_dir) + str(local_dir), ] - + cp_result = subprocess.run( cp_cmd, capture_output=True, text=True, timeout=60 ) - + if cp_result.returncode == 0 and local_dir.exists(): # Count files in directory - file_count = sum(1 for _ in local_dir.rglob('*') if _.is_file()) + file_count = sum(1 for _ in local_dir.rglob("*") if _.is_file()) if file_count > 0: - total_size = sum(f.stat().st_size for f in local_dir.rglob('*') if f.is_file()) - artifacts.append({ - "pod": pod_name, - "type": "tool_output_directory", - "source": f"/workspace/{dir_name}", - "local_path": str(local_dir), - "file_count": file_count, - "size": total_size - }) + total_size = sum( + f.stat().st_size + for f in local_dir.rglob("*") + if f.is_file() + ) + artifacts.append( + { + "pod": pod_name, + "type": "tool_output_directory", + "source": f"/workspace/{dir_name}", + "local_path": str(local_dir), + "file_count": file_count, + "size": total_size, + } + ) self.console.print( f"[dim] ✓ Collected directory: {dir_name} ({file_count} files, {total_size} bytes)[/dim]" ) except Exception: pass # Directory not found - this is expected - + return artifacts - + def _collect_from_pvc( self, deployment_id: str, @@ -2748,22 +3038,31 @@ def _collect_from_pvc( pod_names: Full Kubernetes pod names for this job (ordered) """ pvc_name = 
f"{deployment_id}-results" - + try: # Create a temporary pod to access PVC collector_pod_name = f"collector-{deployment_id[:15]}" - - self.console.print(f"[dim]📦 Collecting artifacts from PVC: {pvc_name}[/dim]") - + + self.console.print( + f"[dim]📦 Collecting artifacts from PVC: {pvc_name}[/dim]" + ) + collector_spec: Dict[str, Any] = { "restartPolicy": "Never", - "containers": [{ - "name": "collector", - "image": "busybox:latest", - "command": ["sh", "-c", "sleep 600"], - "volumeMounts": [{"name": "results", "mountPath": "/results"}] - }], - "volumes": [{"name": "results", "persistentVolumeClaim": {"claimName": pvc_name}}] + "containers": [ + { + "name": "collector", + "image": "busybox:latest", + "command": ["sh", "-c", "sleep 600"], + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "volumes": [ + { + "name": "results", + "persistentVolumeClaim": {"claimName": pvc_name}, + } + ], } ips = getattr(self, "_image_pull_secrets_for_pods", None) or [] if ips: @@ -2785,10 +3084,10 @@ def _collect_from_pvc( except ApiException as e: if e.status != 404: # 404 means pod doesn't exist, which is fine pass - + # Create collector pod self.core_v1.create_namespaced_pod(self.namespace, collector_pod_spec) - + # Wait for pod to be ready for _ in range(30): # Wait up to 30 seconds try: @@ -2800,7 +3099,9 @@ def _collect_from_pvc( except ApiException as e: # Pod not found yet or not ready - this is expected during startup if e.status != 404: - self.console.print(f"[dim]Waiting for collector pod (status: {e.status})...[/dim]") + self.console.print( + f"[dim]Waiting for collector pod (status: {e.status})...[/dim]" + ) time.sleep(1) else: raise Exception("Collector pod did not start in time") @@ -2874,11 +3175,15 @@ def _collect_from_pvc( str(local_pod_dir), ] - cp_result = subprocess.run(cp_cmd, capture_output=True, text=True, timeout=60) + cp_result = subprocess.run( + cp_cmd, capture_output=True, text=True, timeout=60 + ) if cp_result.returncode == 0: # 
Count collected files - file_count = sum(1 for _ in local_pod_dir.rglob('*') if _.is_file()) + file_count = sum( + 1 for _ in local_pod_dir.rglob("*") if _.is_file() + ) if file_count > 0: art: Dict[str, Any] = { "source": f"PVC:{pvc_name}/{pod_dir_name}", @@ -2899,7 +3204,7 @@ def _collect_from_pvc( self.console.print( f"[dim] ✓ Collected {file_count} files from {pod_dir_name} → {dest_hint}[/dim]" ) - + self.console.print(f"[green]✓ Collected artifacts from PVC[/green]") else: hint = "" @@ -2916,19 +3221,19 @@ def _collect_from_pvc( self.console.print( f"[yellow]⚠ No results found in PVC after retries{hint}[/yellow]" ) - + # Cleanup collector pod self.core_v1.delete_namespaced_pod( collector_pod_name, self.namespace, grace_period_seconds=0 ) - + except Exception as e: self.console.print(f"[yellow]⚠ Could not collect from PVC: {e}[/yellow]") - + def _generate_results_summary(self, results: Dict, results_dir: Path): """ Generate a summary JSON of all collected artifacts. - + Args: results: Results dict with logs and artifacts results_dir: Directory where results are saved @@ -2950,44 +3255,49 @@ def _generate_results_summary(self, results: Dict, results_dir: Path): "successful_runs": len(results["successful_runs"]), "failed_runs": len(results["failed_runs"]), } - + # Group artifacts by type for artifact in results["artifacts"]: artifact_type = artifact.get("type", "unknown") - summary["artifacts_by_type"][artifact_type] = summary["artifacts_by_type"].get(artifact_type, 0) + 1 - + summary["artifacts_by_type"][artifact_type] = ( + summary["artifacts_by_type"].get(artifact_type, 0) + 1 + ) + summary_file = results_dir / "results_summary.json" summary_file.write_text(json.dumps(summary, indent=2)) - + self.console.print(f"[green]✓ Results summary: {summary_file}[/green]") - + # Print summary table if artifacts were collected if summary["artifacts_by_type"]: from rich.table import Table + table = Table(title="Collected Artifacts") table.add_column("Type", style="cyan") 
table.add_column("Count", justify="right", style="green") - + for artifact_type, count in sorted(summary["artifacts_by_type"].items()): table.add_row(artifact_type, str(count)) - + self.console.print(table) - - def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: str, error_msg: str) -> Dict: + + def _create_failure_record( + self, model_info: Dict, build_info: Dict, pod_name: str, error_msg: str + ) -> Dict: """ Create a failure record for perf.csv when performance metrics are missing. - + Args: model_info: Model information from manifest build_info: Build information from manifest pod_name: Kubernetes pod name error_msg: Error message describing the failure - + Returns: Dict with all perf.csv fields marked as FAILED """ import os - + # Get topology information for failure record deployment_config = self.manifest.get("deployment_config", {}) distributed_config = deployment_config.get("distributed", {}) @@ -2997,7 +3307,7 @@ def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: s nproc_per_node = int(model_info.get("n_gpus", 1)) # Launcher: use distributed.launcher when set, otherwise "native" for k8s launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") - + # Create a record with the same structure as successful runs # but with performance=0, metric="", and status="FAILED" result = { @@ -3006,45 +3316,40 @@ def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: s "n_gpus": str(nnodes * nproc_per_node), "nnodes": str(nnodes), "gpus_per_node": str(nproc_per_node), - # Model configuration "training_precision": model_info.get("training_precision", ""), "pipeline": get_pipeline(), "args": model_info.get("args", ""), "tags": model_info.get("tags", ""), - # Build information "docker_file": build_info.get("dockerfile", ""), "base_docker": build_info.get("base_docker", ""), "docker_sha": build_info.get("docker_sha", ""), "docker_image": build_info.get("docker_image", ""), - 
# Runtime information "git_commit": "", "machine_name": pod_name, "deployment_type": "kubernetes", "launcher": launcher, "gpu_architecture": "", - # Performance metrics - FAILED "performance": "0", "metric": error_msg, # Store error message in metric field "relative_change": "", "status": "FAILURE", # Use "FAILURE" to match CSV schema - # Timing "build_duration": build_info.get("build_duration", ""), "test_duration": "", - # Data information "dataname": model_info.get("data", ""), "data_provider_type": "", "data_size": "", "data_download_duration": "", - # Build tracking "build_number": get_build_number(), - "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + "additional_docker_run_options": model_info.get( + "additional_docker_run_options", "" + ), } flatten_tags_in_place(result) return result @@ -3082,12 +3387,16 @@ def _build_perf_entry_from_aggregated( if nproc_per_node is None: nproc_per_node = int(model_info.get("n_gpus", 1)) launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") - test_duration = aggregated_record.get("test_duration") or aggregated_record.get("duration", "") + test_duration = aggregated_record.get("test_duration") or aggregated_record.get( + "duration", "" + ) run_details = { "model": model_info.get("name", aggregated_record.get("model", "")), "n_gpus": str(aggregated_record.get("n_gpus", nnodes * nproc_per_node)), "nnodes": str(aggregated_record.get("nnodes", nnodes)), - "gpus_per_node": str(aggregated_record.get("gpus_per_node", nproc_per_node)), + "gpus_per_node": str( + aggregated_record.get("gpus_per_node", nproc_per_node) + ), "training_precision": model_info.get("training_precision", ""), "pipeline": get_pipeline(), "args": model_info.get("args", ""), @@ -3112,7 +3421,9 @@ def _build_perf_entry_from_aggregated( "data_size": "", "data_download_duration": "", "build_number": get_build_number(), - "additional_docker_run_options": model_info.get("additional_docker_run_options", 
""), + "additional_docker_run_options": model_info.get( + "additional_docker_run_options", "" + ), } flatten_tags_in_place(run_details) try: @@ -3174,7 +3485,9 @@ def _build_common_info_dict( "data_size": "", "data_download_duration": "", "build_number": get_build_number(), - "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + "additional_docker_run_options": model_info.get( + "additional_docker_run_options", "" + ), } flatten_tags_in_place(result) return result @@ -3191,14 +3504,14 @@ def _create_multiple_result_row_record( Same shape as _create_failure_record but with SUCCESS and item's performance/metric/model. """ import os - + deployment_config = self.manifest.get("deployment_config", {}) distributed_config = deployment_config.get("distributed", {}) nnodes = distributed_config.get("nnodes", 1) nproc_per_node = distributed_config.get("nproc_per_node") if nproc_per_node is None: nproc_per_node = int(model_info.get("n_gpus", 1)) - + # Launcher: use distributed.launcher when set, otherwise "native" for k8s launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") result = { @@ -3230,11 +3543,13 @@ def _create_multiple_result_row_record( "data_size": "", "data_download_duration": "", "build_number": get_build_number(), - "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + "additional_docker_run_options": model_info.get( + "additional_docker_run_options", "" + ), } flatten_tags_in_place(result) return result - + def _parse_multiple_results_from_artifacts( self, results_dir: Path, @@ -3246,17 +3561,19 @@ def _parse_multiple_results_from_artifacts( Parse performance from a multiple_results CSV (e.g. perf_dummy.csv) collected from PVC. Used when the model only writes CSV and does not print 'performance: X Y' to the log (same contract as local container_runner multiple_results handling). 
- + Returns: List of perf_data dicts (same shape as _parse_node_performance), or empty list. """ import csv as csv_module + multiple_results_file = model_info.get("multiple_results") filename = Path(multiple_results_file).name if multiple_results_file else None # Try to get gpu_architecture from first pod log gpu_arch = "N/A" if results.get("logs"): import re + log_content = results["logs"][0].get("log", "") gpu_arch_match = re.search(r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)", log_content) if gpu_arch_match: @@ -3279,7 +3596,11 @@ def _parse_multiple_results_from_artifacts( with open(csv_path, "r", encoding="utf-8", errors="ignore") as f: reader = csv_module.DictReader(f) reader.fieldnames = [f.strip() for f in (reader.fieldnames or [])] - if not reader.fieldnames or "performance" not in reader.fieldnames or "metric" not in reader.fieldnames: + if ( + not reader.fieldnames + or "performance" not in reader.fieldnames + or "metric" not in reader.fieldnames + ): continue for row_idx, row in enumerate(reader): perf_val = row.get("performance", "").strip() @@ -3293,17 +3614,19 @@ def _parse_multiple_results_from_artifacts( # Same model naming as local handle_multiple_results: model_name + "_" + str(model) row_model = row.get("model", row_idx) display_model = f"{model_info.get('name')}_{row_model}" - parsed_list.append({ - "model": display_model, - "performance": perf_float, - "metric": metric_val, - "node_id": row_idx, - "local_gpus": 1, - "duration": "N/A", - "gpu_architecture": gpu_arch, - "data_name": "N/A", - "data_provider": "N/A", - }) + parsed_list.append( + { + "model": display_model, + "performance": perf_float, + "metric": metric_val, + "node_id": row_idx, + "local_gpus": 1, + "duration": "N/A", + "gpu_architecture": gpu_arch, + "data_name": "N/A", + "data_provider": "N/A", + } + ) if parsed_list: self.console.print( f"[green] ✓ Parsed performance from {csv_path.name} ({len(parsed_list)} row(s))[/green]" @@ -3323,21 +3646,43 @@ def _aggregation_for_extra_column(self, 
column_name: str) -> str: """ col = column_name.lower().strip() # Sum: counts, totals, throughput-like - if any(k in col for k in [ - "count", "total", "samples", "tokens", "throughput", - "requests", "images", "bandwidth", "ops" - ]): + if any( + k in col + for k in [ + "count", + "total", + "samples", + "tokens", + "throughput", + "requests", + "images", + "bandwidth", + "ops", + ] + ): return "sum" # Average: rates per unit, utilization, ratios - if any(k in col for k in [ - "utilization", "usage", "percent", "ratio", "latency", - "time_ms", "ttft", "tpot", "accuracy", "loss" - ]): + if any( + k in col + for k in [ + "utilization", + "usage", + "percent", + "ratio", + "latency", + "time_ms", + "ttft", + "tpot", + "accuracy", + "loss", + ] + ): return "average" # Max: duration (slowest node), memory, capacity - if any(k in col for k in [ - "duration", "time", "seconds", "memory", "bytes", "mb", "gb" - ]): + if any( + k in col + for k in ["duration", "time", "seconds", "memory", "bytes", "mb", "gb"] + ): return "max" return "first" @@ -3420,7 +3765,11 @@ def _merge_multi_node_multiple_results_csv( continue values = [r.get(col) for r in group] try: - nums = [float(str(v).strip()) for v in values if v is not None and str(v).strip()] + nums = [ + float(str(v).strip()) + for v in values + if v is not None and str(v).strip() + ] except (ValueError, TypeError): nums = [] if nums: @@ -3441,7 +3790,9 @@ def _merge_multi_node_multiple_results_csv( return False output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w", newline="", encoding="utf-8") as f: - writer = csv_module.DictWriter(f, fieldnames=all_columns, extrasaction="ignore") + writer = csv_module.DictWriter( + f, fieldnames=all_columns, extrasaction="ignore" + ) writer.writeheader() writer.writerows(merged_rows) self.console.print( @@ -3495,7 +3846,9 @@ def cleanup(self, deployment_id: str) -> bool: self.console.print(f"[yellow]Deleted K8s Job: {deployment_id}[/yellow]") except ApiException 
as e: if e.status != 404: - self.console.print(f"[yellow]⚠ Job cleanup warning: {e.reason}[/yellow]") + self.console.print( + f"[yellow]⚠ Job cleanup warning: {e.reason}[/yellow]" + ) success = False except Exception as e: self.console.print(f"[yellow]⚠ Job cleanup error: {e}[/yellow]") @@ -3507,9 +3860,7 @@ def cleanup(self, deployment_id: str) -> bool: self.core_v1.delete_namespaced_config_map( name=configmap_name, namespace=self.namespace ) - self.console.print( - f"[yellow]Deleted ConfigMap: {configmap_name}[/yellow]" - ) + self.console.print(f"[yellow]Deleted ConfigMap: {configmap_name}[/yellow]") except ApiException as e: if e.status != 404: self.console.print( @@ -3532,4 +3883,3 @@ def cleanup(self, deployment_id: str) -> bool: pass return success - diff --git a/src/madengine/deployment/kubernetes_launcher_mixin.py b/src/madengine/deployment/kubernetes_launcher_mixin.py index e875f6d2..2b4d517e 100644 --- a/src/madengine/deployment/kubernetes_launcher_mixin.py +++ b/src/madengine/deployment/kubernetes_launcher_mixin.py @@ -28,28 +28,28 @@ def _generate_torchrun_command( ) -> str: """ Generate torchrun launcher command for K8s Indexed Jobs. - + For single-node (nnodes=1), generates standalone torchrun command. For multi-node (nnodes>1), generates distributed torchrun with headless service DNS for coordination. - + Uses K8s environment variables for distributed coordination: - JOB_COMPLETION_INDEX: Pod index (0, 1, 2, ...) - Headless service DNS for MASTER_ADDR - + CRITICAL FIX: For bash scripts that use ${BASH_SOURCE[0]}, we cd into the script directory first so relative paths resolve correctly. This fixes the issue where profiling tool wrappers prevent BASH_SOURCE from resolving. - + Args: nnodes: Number of nodes (pods). Must be >= 1. nproc_per_node: GPUs per node. Must be >= 1. master_port: Master communication port. Must be 1-65535. model_script: Path to model's run script. Cannot be empty. 
- + Returns: Complete torchrun command string - + Raises: ValueError: If any parameter is invalid """ @@ -57,15 +57,19 @@ def _generate_torchrun_command( if not isinstance(nnodes, int) or nnodes < 1: raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + raise ValueError( + f"nproc_per_node must be integer >= 1, got {nproc_per_node}" + ) if not isinstance(master_port, int) or not (1 <= master_port <= 65535): raise ValueError(f"master_port must be 1-65535, got {master_port}") if not model_script or not isinstance(model_script, str): - raise ValueError(f"model_script must be non-empty string, got {model_script}") - + raise ValueError( + f"model_script must be non-empty string, got {model_script}" + ) + # Check if model_script is a bash script # If so, execute it directly as it handles torchrun internally - if model_script.endswith('.sh'): + if model_script.endswith(".sh"): # For bash scripts, set environment variables and execute script # The script itself will invoke torchrun with the appropriate Python file # CRITICAL: cd to script directory first so BASH_SOURCE[0] resolves correctly @@ -82,7 +86,7 @@ def _generate_torchrun_command( export MAD_MULTI_NODE_RUNNER="torchrun --nnodes={nnodes} --nproc_per_node={nproc_per_node} --node_rank=${{JOB_COMPLETION_INDEX}} --master_addr=${{MASTER_ADDR}} --master_port={master_port}" export MAD_RUNTIME_NGPUS={nproc_per_node} cd {script_dir} && bash {script_name}""" - + # For Python scripts, invoke torchrun directly # For single-node, simpler standalone command if nnodes == 1: @@ -91,7 +95,7 @@ def _generate_torchrun_command( --nnodes=1 \\ --nproc_per_node={nproc_per_node} \\ {model_script}""" - + # Multi-node: Use headless service DNS and JOB_COMPLETION_INDEX return f"""# Multi-node torchrun setup (Kubernetes Indexed Job) export 
MASTER_ADDR="{self.job_name}-0.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local" @@ -118,32 +122,32 @@ def _generate_torchrun_command( --role=worker \\ --tee=3 \\ {model_script}""" - + def _generate_deepspeed_command( self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str ) -> str: """ Generate DeepSpeed launcher command for K8s Indexed Jobs. - + DeepSpeed has its own launcher that handles: - ZeRO optimization stages (ZeRO-1, ZeRO-2, ZeRO-3) - Gradient accumulation - Mixed precision training - Pipeline parallelism - Hostfile management (handled by K8s in our case) - + For single-node (nnodes=1), uses localhost setup. For multi-node (nnodes>1), uses headless service DNS for coordination. - + Args: nnodes: Number of nodes (pods). Must be >= 1. nproc_per_node: GPUs per node. Must be >= 1. master_port: Master communication port. Must be 1-65535. model_script: Path to model's run script. Cannot be empty. - + Returns: Complete DeepSpeed launcher command string - + Raises: ValueError: If any parameter is invalid """ @@ -151,12 +155,16 @@ def _generate_deepspeed_command( if not isinstance(nnodes, int) or nnodes < 1: raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + raise ValueError( + f"nproc_per_node must be integer >= 1, got {nproc_per_node}" + ) if not isinstance(master_port, int) or not (1 <= master_port <= 65535): raise ValueError(f"master_port must be 1-65535, got {master_port}") if not model_script or not isinstance(model_script, str): - raise ValueError(f"model_script must be non-empty string, got {model_script}") - + raise ValueError( + f"model_script must be non-empty string, got {model_script}" + ) + # For single-node if nnodes == 1: return f"""# DeepSpeed Single-Node Setup @@ -176,7 +184,7 @@ def _generate_deepspeed_command( deepspeed --num_gpus={nproc_per_node} 
\\ --master_port={master_port} \\ {model_script}""" - + # Multi-node: Use K8s headless service for coordination return f"""# Multi-node DeepSpeed setup (Kubernetes Indexed Job) export MASTER_ADDR="{self.job_name}-0.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local" @@ -217,25 +225,25 @@ def _generate_deepspeed_command( --num_nodes={nnodes} \\ --num_gpus={nproc_per_node} \\ {model_script}""" - + def _generate_bash_script_command( self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str ) -> str: """ Generate command to execute a bash script directly. - + This is used when the model script is a .sh file that handles launcher invocation internally (e.g., using torchrun inside the script). - + Sets up environment variables for distributed training that the bash script can use. - + Args: nnodes: Number of nodes (pods) nproc_per_node: GPUs per node master_port: Master communication port model_script: Path to the bash script - + Returns: Command to execute the bash script with environment setup """ @@ -262,7 +270,7 @@ def _generate_bash_script_command( # Execute the bash script directly bash {model_script}""" - + # Multi-node: Use K8s headless service for coordination return f"""# Bash Script Execution (Multi-Node) # Setting up environment for script to use @@ -286,47 +294,47 @@ def _generate_bash_script_command( # Execute the bash script directly bash {model_script}""" - + def _generate_torchtitan_command( self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str ) -> str: """ Generate TorchTitan launcher command for K8s Indexed Jobs. - + TorchTitan is a PyTorch native platform for large-scale LLM pre-training that supports multi-dimensional parallelism: - FSDP2 (Fully Sharded Data Parallel v2) - Tensor Parallel (TP) - Pipeline Parallel (PP) - Context Parallel (CP) - + TorchTitan uses torchrun as its underlying distributed launcher but requires additional configuration for its parallelism strategies. 
- + For single-node (nnodes=1): Uses standalone torchrun with TP For multi-node (nnodes>1): Uses distributed torchrun with TP+PP+FSDP2 - + Uses K8s environment variables for distributed coordination: - JOB_COMPLETION_INDEX: Pod index (0, 1, 2, ...) - Headless service DNS for MASTER_ADDR - + Args: nnodes: Number of nodes (pods). Must be >= 1. nproc_per_node: GPUs per node. Must be >= 1. master_port: Master communication port. Must be 1-65535. model_script: Path to model's run script. Cannot be empty. - + Returns: Complete torchtitan launch command string with environment setup - + Raises: ValueError: If any parameter is invalid - + Example single-node output: export TORCHTITAN_TENSOR_PARALLEL_SIZE=8 export TORCHTITAN_PIPELINE_PARALLEL_SIZE=1 torchrun --standalone --nproc_per_node=8 train.py --config llama3_8b.toml - + Example multi-node output: export MASTER_ADDR="job-0.job.namespace.svc.cluster.local" export TORCHTITAN_TENSOR_PARALLEL_SIZE=8 @@ -338,12 +346,16 @@ def _generate_torchtitan_command( if not isinstance(nnodes, int) or nnodes < 1: raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + raise ValueError( + f"nproc_per_node must be integer >= 1, got {nproc_per_node}" + ) if not isinstance(master_port, int) or not (1 <= master_port <= 65535): raise ValueError(f"master_port must be 1-65535, got {master_port}") if not model_script or not isinstance(model_script, str): - raise ValueError(f"model_script must be non-empty string, got {model_script}") - + raise ValueError( + f"model_script must be non-empty string, got {model_script}" + ) + # For single-node, use standalone mode with Tensor Parallelism only if nnodes == 1: return f"""# TorchTitan single-node setup (Tensor Parallelism) @@ -362,7 +374,7 @@ def _generate_torchtitan_command( --nnodes=1 \\ --nproc_per_node={nproc_per_node} \\ {model_script}""" - + # 
Multi-node: Use headless service DNS and enable all parallelism strategies return f"""# TorchTitan multi-node setup (K8s Indexed Job) export MASTER_ADDR="{self.job_name}-0.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local" @@ -399,34 +411,34 @@ def _generate_torchtitan_command( --role=worker \\ --tee=3 \\ {model_script}""" - + def _generate_sglang_disagg_command( self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str ) -> str: """ Generate SGLang Disaggregated launcher command for K8s Indexed Jobs. - + SGLang Disaggregated uses separate node pools for: - Proxy (index 0): Load balancer and request router - Prefill (indices 1 to xP): Prompt processing - Decode (indices xP+1 to end): Token generation - + Communication via Mooncake framework for efficient KV cache transfer. - + Architecture: - Pod 0: Runs mini_lb (proxy/load balancer) - Pods 1-xP: Run prefill servers - Pods xP+1 to N-1: Run decode servers - + Args: nnodes: Total number of pods (must be >= 3) nproc_per_node: GPUs per pod master_port: Port for proxy service model_script: Path to model launch script - + Returns: Complete disaggregated launch setup - + Raises: ValueError: If nnodes < 3 or invalid parameters """ @@ -439,12 +451,14 @@ def _generate_sglang_disagg_command( raise ValueError(f"nproc_per_node must be >= 1, got {nproc_per_node}") if not model_script or not isinstance(model_script, str): raise ValueError(f"model_script must be non-empty string") - + # Check if custom split is specified in additional_context - sglang_disagg_config = self.config.additional_context.get("distributed", {}).get("sglang_disagg", {}) + sglang_disagg_config = self.config.additional_context.get( + "distributed", {} + ).get("sglang_disagg", {}) prefill_nodes = sglang_disagg_config.get("prefill_nodes") decode_nodes = sglang_disagg_config.get("decode_nodes") - + if prefill_nodes is not None and decode_nodes is not None: # User specified custom split - validate if prefill_nodes < 1 or 
decode_nodes < 1: @@ -464,18 +478,22 @@ def _generate_sglang_disagg_command( # Default automatic split (can be customized via additional_context) xP = max(1, (nnodes - 1) * 2 // 5) # ~40% prefill yD = nnodes - 1 - xP # remaining decode - + # Build prefill and decode server lists - prefill_servers = " ".join([ - f"http://{self.job_name}-{i}.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local:30000" - for i in range(1, xP + 1) - ]) - - decode_servers = " ".join([ - f"http://{self.job_name}-{i}.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local:30000" - for i in range(xP + 1, nnodes) - ]) - + prefill_servers = " ".join( + [ + f"http://{self.job_name}-{i}.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local:30000" + for i in range(1, xP + 1) + ] + ) + + decode_servers = " ".join( + [ + f"http://{self.job_name}-{i}.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local:30000" + for i in range(xP + 1, nnodes) + ] + ) + return f"""# SGLang Disaggregated K8s Setup # ============================================ # Cluster: {nnodes} pods total @@ -508,17 +526,17 @@ def _generate_sglang_disagg_command( if [ "$POD_INDEX" -eq 0 ]; then # Proxy Node (Load Balancer) echo "🔀 This pod is PROXY (Load Balancer)" - + python3 -m sglang.srt.disaggregation.mini_lb \\ --prefill {prefill_servers} \\ --decode {decode_servers} \\ --host 0.0.0.0 \\ --port {master_port} - + elif [ "$POD_INDEX" -le "{xP}" ]; then # Prefill Nodes echo "⚡ This pod is PREFILL Node" - + python3 -m sglang.launch_server \\ --model-path "$MODEL_PATH" \\ --disaggregation-mode prefill \\ @@ -527,11 +545,11 @@ def _generate_sglang_disagg_command( --port 30000 \\ --trust-remote-code \\ --disaggregation-transfer-backend mooncake - + else # Decode Nodes echo "🔤 This pod is DECODE Node" - + python3 -m sglang.launch_server \\ --model-path "$MODEL_PATH" \\ --disaggregation-mode decode \\ @@ -544,7 +562,7 @@ def _generate_sglang_disagg_command( echo 
"SGLang Disaggregated setup complete" """ - + def _generate_vllm_command( self, nnodes: int, @@ -555,32 +573,32 @@ def _generate_vllm_command( ) -> str: """ Generate vLLM launcher command for K8s Indexed Jobs. - + vLLM is an inference engine with its own process management via Ray. Unlike training frameworks, vLLM doesn't use torchrun. - + Architecture: - Single-node: Tensor Parallelism (TP) across GPUs, no Ray needed - Multi-node: Data Parallelism where each node runs independent vLLM replica * Each replica uses TP across its local GPUs * Ray coordinates resources on each node independently * Benefits: Simpler, more robust, better for inference serving - + For K8s multi-node: - Each pod runs its own independent vLLM instance - Uses Ray for local GPU coordination - NO shared Ray cluster across pods (Data Parallelism mode) - + Args: nnodes: Number of nodes (pods). Must be >= 1. nproc_per_node: GPUs per node. Must be >= 1. master_port: Master communication port (for Ray). Must be 1-65535. model_script: Path to model's run script. Cannot be empty. model_args: CLI args for the script (e.g. --model_repo openai/gpt-oss-20b). 
- + Returns: Complete vLLM launch setup with environment configuration - + Raises: ValueError: If any parameter is invalid """ @@ -588,17 +606,23 @@ def _generate_vllm_command( if not isinstance(nnodes, int) or nnodes < 1: raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + raise ValueError( + f"nproc_per_node must be integer >= 1, got {nproc_per_node}" + ) if not isinstance(master_port, int) or not (1 <= master_port <= 65535): raise ValueError(f"master_port must be 1-65535, got {master_port}") if not model_script or not isinstance(model_script, str): - raise ValueError(f"model_script must be non-empty string, got {model_script}") + raise ValueError( + f"model_script must be non-empty string, got {model_script}" + ) # Run script from its directory so relative paths (run_vllm.py, configs/) resolve script_dir = str(Path(model_script).parent) script_name = Path(model_script).name - run_cmd = f"cd /workspace/{script_dir} && bash {script_name} {model_args}".strip() - + run_cmd = ( + f"cd /workspace/{script_dir} && bash {script_name} {model_args}".strip() + ) + # For single-node, simple TP setup (no Ray needed) if nnodes == 1: return f"""# vLLM single-node setup (Tensor Parallelism) @@ -617,7 +641,7 @@ def _generate_vllm_command( # vLLM handles process management - run script from its directory so run_vllm.py/configs resolve {run_cmd}""" - + # Multi-node: Data Parallelism with independent Ray clusters per pod return f"""# vLLM multi-node setup (K8s Data Parallelism Mode) export MASTER_ADDR="{self.job_name}-0.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local" @@ -677,31 +701,31 @@ def _generate_sglang_command( ) -> str: """ Generate SGLang launcher command for K8s Indexed Jobs. - + SGLang is an inference engine with native launcher (sglang.launch_server). 
Similar to vLLM, it manages its own process spawning via Ray. - + Architecture: - Single-node: Tensor Parallelism (TP) across GPUs - Multi-node: Uses SGLang's native multi-node launcher with Ray * TP across GPUs within each node * Ray for distributed coordination - + For K8s: - Uses headless service for node discovery (similar to torchrun) - Each pod knows its rank via JOB_COMPLETION_INDEX - SGLang native launcher handles Ray cluster setup - + Args: nnodes: Number of nodes (pods). Must be >= 1. nproc_per_node: GPUs per node. Must be >= 1. master_port: Master communication port (for NCCL/Ray). Must be 1-65535. model_script: Path to model's run script. Cannot be empty. model_args: CLI args for the script (e.g. --model_repo ...). - + Returns: Complete SGLang launch setup with environment configuration - + Raises: ValueError: If any parameter is invalid """ @@ -709,16 +733,22 @@ def _generate_sglang_command( if not isinstance(nnodes, int) or nnodes < 1: raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + raise ValueError( + f"nproc_per_node must be integer >= 1, got {nproc_per_node}" + ) if not isinstance(master_port, int) or not (1 <= master_port <= 65535): raise ValueError(f"master_port must be 1-65535, got {master_port}") if not model_script or not isinstance(model_script, str): - raise ValueError(f"model_script must be non-empty string, got {model_script}") + raise ValueError( + f"model_script must be non-empty string, got {model_script}" + ) # Run script from its directory so relative paths resolve; pass model args script_dir = str(Path(model_script).parent) script_name = Path(model_script).name - run_cmd = f"cd /workspace/{script_dir} && bash {script_name} {model_args}".strip() + run_cmd = ( + f"cd /workspace/{script_dir} && bash {script_name} {model_args}".strip() + ) # For single-node, simple TP setup if 
nnodes == 1: @@ -781,30 +811,30 @@ def _generate_megatron_command( ) -> str: """ Generate Megatron-LM launcher command for K8s Indexed Jobs. - + Megatron-LM is a training framework for large transformers with tensor and pipeline parallelism. It uses torchrun as the underlying launcher but with Megatron-specific environment variables. - + Architecture: - Single-node: Tensor Parallelism (TP) across GPUs - Multi-node: Tensor + Pipeline Parallelism * TP across GPUs within each node * PP across nodes - + For K8s: - Uses headless service for node discovery (like torchrun/deepspeed) - Each pod knows its rank via JOB_COMPLETION_INDEX - Sets TENSOR_MODEL_PARALLEL_SIZE and PIPELINE_MODEL_PARALLEL_SIZE (Megatron-Core standard) - + Args: nnodes: Number of nodes (pods). Must be >= 1. nproc_per_node: GPUs per node. Must be >= 1. master_port: Master communication port (for NCCL). Must be 1-65535. model_script: Path to model's run script. Cannot be empty. - + Returns: Complete Megatron-LM launch setup with environment configuration - + Raises: ValueError: If any parameter is invalid """ @@ -812,12 +842,16 @@ def _generate_megatron_command( if not isinstance(nnodes, int) or nnodes < 1: raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + raise ValueError( + f"nproc_per_node must be integer >= 1, got {nproc_per_node}" + ) if not isinstance(master_port, int) or not (1 <= master_port <= 65535): raise ValueError(f"master_port must be 1-65535, got {master_port}") if not model_script or not isinstance(model_script, str): - raise ValueError(f"model_script must be non-empty string, got {model_script}") - + raise ValueError( + f"model_script must be non-empty string, got {model_script}" + ) + # For single-node, use TP only if nnodes == 1: return f"""# Megatron-LM single-node setup (Tensor Parallelism) @@ -840,7 +874,7 @@ def 
_generate_megatron_command( --standalone \\ --nproc_per_node={nproc_per_node} \\ {model_script}""" - + # Multi-node: TP + PP else: # Use headless service for node discovery (set by template) @@ -913,7 +947,9 @@ def _generate_primus_command( manifest if isinstance(manifest, dict) else None, self.config.additional_context, ) - config_path = primus_cfg.get("config_path", "examples/torchtitan/configs/MI300X/qwen3_1.7B-pretrain.yaml") + config_path = primus_cfg.get( + "config_path", "examples/torchtitan/configs/MI300X/qwen3_1.7B-pretrain.yaml" + ) cli_extra = primus_cfg.get("cli_extra", "") config_path_quoted = config_path.replace('"', '\\"') lines = [ @@ -945,9 +981,7 @@ def _generate_primus_command( ] ) else: - master_dns = ( - f"{self.job_name}-0.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local" - ) + master_dns = f"{self.job_name}-0.{self._k8s_headless_subdomain_label}.{self.namespace}.svc.cluster.local" lines.extend( [ "# Multi-node: Indexed Job + headless Service (pod-0 DNS as master)", @@ -969,4 +1003,3 @@ def _generate_primus_command( lines.append(f"cd {script_dir} && bash {script_name}") return "\n".join(lines) - diff --git a/src/madengine/deployment/presets/__init__.py b/src/madengine/deployment/presets/__init__.py index f554fc4f..cfa6513e 100644 --- a/src/madengine/deployment/presets/__init__.py +++ b/src/madengine/deployment/presets/__init__.py @@ -3,4 +3,3 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ - diff --git a/src/madengine/deployment/presets/k8s/__init__.py b/src/madengine/deployment/presets/k8s/__init__.py index 25a33dfa..3e0eb94d 100644 --- a/src/madengine/deployment/presets/k8s/__init__.py +++ b/src/madengine/deployment/presets/k8s/__init__.py @@ -3,4 +3,3 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" - diff --git a/src/madengine/deployment/presets/k8s/defaults.json b/src/madengine/deployment/presets/k8s/defaults.json index 36fc9f3e..d42b60b5 100644 --- a/src/madengine/deployment/presets/k8s/defaults.json +++ b/src/madengine/deployment/presets/k8s/defaults.json @@ -3,7 +3,7 @@ "gpu_vendor": "AMD", "guest_os": "UBUNTU", "debug": false, - + "k8s": { "kubeconfig": "~/.kube/config", "namespace": "default", @@ -21,9 +21,8 @@ "runtime_secret_name": null } }, - + "env_vars": { "OMP_NUM_THREADS": "8" } } - diff --git a/src/madengine/deployment/presets/k8s/gpu-vendors/amd-multi-gpu.json b/src/madengine/deployment/presets/k8s/gpu-vendors/amd-multi-gpu.json index 6e559742..72e53cd8 100644 --- a/src/madengine/deployment/presets/k8s/gpu-vendors/amd-multi-gpu.json +++ b/src/madengine/deployment/presets/k8s/gpu-vendors/amd-multi-gpu.json @@ -13,4 +13,3 @@ "RCCL_ENABLE_HIPGRAPH": "0" } } - diff --git a/src/madengine/deployment/presets/k8s/gpu-vendors/amd.json b/src/madengine/deployment/presets/k8s/gpu-vendors/amd.json index 42069620..095a8212 100644 --- a/src/madengine/deployment/presets/k8s/gpu-vendors/amd.json +++ b/src/madengine/deployment/presets/k8s/gpu-vendors/amd.json @@ -4,4 +4,3 @@ "gpu_resource_name": "amd.com/gpu" } } - diff --git a/src/madengine/deployment/presets/k8s/gpu-vendors/nvidia.json b/src/madengine/deployment/presets/k8s/gpu-vendors/nvidia.json index f7831f92..618eb9e0 100644 --- a/src/madengine/deployment/presets/k8s/gpu-vendors/nvidia.json +++ b/src/madengine/deployment/presets/k8s/gpu-vendors/nvidia.json @@ -12,4 +12,3 @@ "OMP_NUM_THREADS": "12" } } - diff --git a/src/madengine/deployment/presets/k8s/profiles/multi-gpu.json b/src/madengine/deployment/presets/k8s/profiles/multi-gpu.json index f92df7f6..d3b293b1 100644 --- a/src/madengine/deployment/presets/k8s/profiles/multi-gpu.json +++ b/src/madengine/deployment/presets/k8s/profiles/multi-gpu.json @@ -13,4 +13,3 @@ "master_port": 29500 } } - diff --git 
a/src/madengine/deployment/presets/k8s/profiles/multi-node.json b/src/madengine/deployment/presets/k8s/profiles/multi-node.json index 3d814f38..6ccd87c0 100644 --- a/src/madengine/deployment/presets/k8s/profiles/multi-node.json +++ b/src/madengine/deployment/presets/k8s/profiles/multi-node.json @@ -19,4 +19,3 @@ "NCCL_TIMEOUT": "600" } } - diff --git a/src/madengine/deployment/presets/k8s/profiles/single-gpu.json b/src/madengine/deployment/presets/k8s/profiles/single-gpu.json index 34106655..9f04998c 100644 --- a/src/madengine/deployment/presets/k8s/profiles/single-gpu.json +++ b/src/madengine/deployment/presets/k8s/profiles/single-gpu.json @@ -8,4 +8,3 @@ "cpu_limit": "16" } } - diff --git a/src/madengine/deployment/presets/slurm/__init__.py b/src/madengine/deployment/presets/slurm/__init__.py index 9d11608c..00a1043d 100644 --- a/src/madengine/deployment/presets/slurm/__init__.py +++ b/src/madengine/deployment/presets/slurm/__init__.py @@ -12,4 +12,3 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ - diff --git a/src/madengine/deployment/presets/slurm/defaults.json b/src/madengine/deployment/presets/slurm/defaults.json index aa98b06f..f73c671a 100644 --- a/src/madengine/deployment/presets/slurm/defaults.json +++ b/src/madengine/deployment/presets/slurm/defaults.json @@ -3,11 +3,11 @@ "_description": "Default configuration for SLURM HPC cluster deployment", "_note": "Default partition is 'amd-rccl' for AMD RCCL cluster. Override if your cluster uses different partition names.", "_best_practice": "Use shared storage workspace for multi-node. 
Single-node auto-detects NFS and uses shared storage when available.", - + "gpu_vendor": "AMD", "guest_os": "UBUNTU", "debug": false, - + "slurm": { "partition": "amd-rccl", "nodes": 1, @@ -17,16 +17,15 @@ "exclusive": true, "modules": [] }, - + "distributed": { "backend": "nccl", "port": 29500 }, - + "env_vars": { "OMP_NUM_THREADS": "8", "MIOPEN_FIND_MODE": "1", "MIOPEN_USER_DB_PATH": "/tmp/.miopen" } } - diff --git a/src/madengine/deployment/presets/slurm/profiles/multi-node.json b/src/madengine/deployment/presets/slurm/profiles/multi-node.json index 2e499307..700bd3bb 100644 --- a/src/madengine/deployment/presets/slurm/profiles/multi-node.json +++ b/src/madengine/deployment/presets/slurm/profiles/multi-node.json @@ -1,18 +1,18 @@ { "_comment": "Multi-node SLURM profile - optimized for distributed workloads across nodes", "_description": "Configuration for multi-node distributed execution (training/inference) on SLURM cluster", - + "slurm": { "nodes": 2, "gpus_per_node": 8, "time": "24:00:00" }, - + "distributed": { "backend": "nccl", "port": 29500 }, - + "env_vars": { "NCCL_DEBUG": "WARN", "NCCL_DEBUG_SUBSYS": "INIT,NET", @@ -27,4 +27,3 @@ "RCCL_ENABLE_HIPGRAPH": "0" } } - diff --git a/src/madengine/deployment/presets/slurm/profiles/single-node.json b/src/madengine/deployment/presets/slurm/profiles/single-node.json index 7c62ef7a..439c8ebe 100644 --- a/src/madengine/deployment/presets/slurm/profiles/single-node.json +++ b/src/madengine/deployment/presets/slurm/profiles/single-node.json @@ -1,15 +1,14 @@ { "_comment": "Single-node SLURM profile - optimized for single node multi-GPU", "_description": "Configuration for running on a single SLURM node with multiple GPUs", - + "slurm": { "nodes": 1, "gpus_per_node": 8, "time": "12:00:00" }, - + "env_vars": { "NCCL_DEBUG": "WARN" } } - diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index b5eefbd4..6a4b4d71 100644 --- a/src/madengine/deployment/slurm.py +++ 
b/src/madengine/deployment/slurm.py @@ -17,7 +17,13 @@ from pathlib import Path from typing import Any, Dict, List, Optional -from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus, create_jinja_env +from .base import ( + BaseDeployment, + DeploymentConfig, + DeploymentResult, + DeploymentStatus, + create_jinja_env, +) from .primus_backend import infer_primus_backend_from_model_name, merged_primus_config from .common import configure_multi_node_profiling, normalize_launcher from .config_loader import ConfigLoader, apply_deployment_config @@ -82,9 +88,7 @@ def validate(self) -> bool: """Validate SLURM commands are available locally.""" # Check required SLURM CLI tools for tool in self.REQUIRED_TOOLS: - result = subprocess.run( - ["which", tool], capture_output=True, timeout=5 - ) + result = subprocess.run(["which", tool], capture_output=True, timeout=5) if result.returncode != 0: self.console.print( f"[red]✗ Required tool not found: {tool}[/red]\n" @@ -104,7 +108,9 @@ def validate(self) -> bool: return False if self.gpus_per_node < 1: - self.console.print(f"[red]✗ Invalid GPUs per node: {self.gpus_per_node}[/red]") + self.console.print( + f"[red]✗ Invalid GPUs per node: {self.gpus_per_node}[/red]" + ) return False self.console.print("[green]✓ SLURM environment validated[/green]") @@ -113,10 +119,10 @@ def validate(self) -> bool: def _validate_cli_availability(self) -> bool: """ Validate madengine is available before job submission. - + Compute nodes inherit the submission environment, so madengine must be available in PATH on the submission node. 
- + Returns: bool: True if madengine is available and functional """ @@ -126,38 +132,31 @@ def _validate_cli_availability(self) -> bool: capture_output=True, text=True, timeout=5, - check=False + check=False, ) if result.returncode == 0: version = result.stdout.strip() or "unknown" self.console.print( f"[green]✓[/green] madengine available: [cyan]{version}[/cyan]" ) - + # Show path for transparency which_result = subprocess.run( - ["which", "madengine"], - capture_output=True, - text=True, - check=False + ["which", "madengine"], capture_output=True, text=True, check=False ) if which_result.returncode == 0: cli_path = which_result.stdout.strip() self.console.print(f" Path: [dim]{cli_path}[/dim]") - + return True else: - self.console.print( - "[red]✗ madengine found but returned error[/red]" - ) + self.console.print("[red]✗ madengine found but returned error[/red]") if result.stderr: self.console.print(f" Error: {result.stderr.strip()}") return False - + except FileNotFoundError: - self.console.print( - "\n[red]✗ ERROR: madengine not found[/red]\n" - ) + self.console.print("\n[red]✗ ERROR: madengine not found[/red]\n") self.console.print( "[yellow]Compute nodes need madengine in PATH.[/yellow]\n" "\n[bold]To fix:[/bold]\n" @@ -184,7 +183,7 @@ def prepare(self) -> bool: "\n[yellow]⚠ Tip: Compute nodes inherit your submission environment[/yellow]" ) return False - + try: self.output_dir.mkdir(parents=True, exist_ok=True) @@ -230,17 +229,21 @@ def _prepare_template_context(self, model_info: Dict) -> Dict[str, Any]: additional_context = self.config.additional_context.copy() additional_context["slurm"] = self.slurm_config resolved_gpus_per_node = resolve_runtime_gpus(model_info, additional_context) - + # Extract launcher configuration - launcher_type = self.distributed_config.get("launcher", "torchrun") # Default to torchrun - + launcher_type = self.distributed_config.get( + "launcher", "torchrun" + ) # Default to torchrun + # Normalize launcher based on deployment type 
and validity launcher_type = normalize_launcher(launcher_type, "slurm") - + nnodes = self.distributed_config.get("nnodes", self.nodes) - nproc_per_node = self.distributed_config.get("nproc_per_node", resolved_gpus_per_node) + nproc_per_node = self.distributed_config.get( + "nproc_per_node", resolved_gpus_per_node + ) master_port = self.distributed_config.get("port", 29500) - + # Apply multi-node profiling logic if tools are configured tools = additional_context.get("tools", []) if nnodes > 1 and tools: @@ -249,28 +252,29 @@ def _prepare_template_context(self, model_info: Dict) -> Dict[str, Any]: class ConsoleLogger: def __init__(self, console): self.console = console + def info(self, msg): self.console.print(f"[cyan]{msg}[/cyan]") + def warning(self, msg): self.console.print(f"[yellow]{msg}[/yellow]") + def debug(self, msg): pass # Skip debug messages in console - + profiling_config = configure_multi_node_profiling( - nnodes=nnodes, - tools_config=tools, - logger=ConsoleLogger(self.console) + nnodes=nnodes, tools_config=tools, logger=ConsoleLogger(self.console) ) - + if profiling_config["enabled"]: tools = profiling_config["tools"] else: # rocprofv3 not available - skip profiling for multi-node tools = [] - + # Update tools in additional_context additional_context["tools"] = tools - + # Generate launcher-specific command launcher_command = self._generate_launcher_command( launcher_type=launcher_type, @@ -279,7 +283,7 @@ def debug(self, msg): master_port=master_port, model_name=model_info.get("name", "") or "", ) - + return { "model_name": model_info["name"], "manifest_file": os.path.abspath(self.config.manifest_file), @@ -306,9 +310,9 @@ def debug(self, msg): "live_output": self.config.additional_context.get("live_output", False), "tags": " ".join(model_info.get("tags", [])), "multiple_results": model_info.get("multiple_results"), - "credential_file": "credential.json" - if Path("credential.json").exists() - else None, + "credential_file": ( + "credential.json" if 
Path("credential.json").exists() else None + ), "data_file": "data.json" if Path("data.json").exists() else None, # Launcher configuration "launcher_type": launcher_type, @@ -329,15 +333,15 @@ def _generate_launcher_command( ) -> str: """ Generate launcher-specific command based on launcher type. - + Follows k8s pattern: different launchers have different command generation. - + Args: launcher_type: Type of launcher (torchrun, vllm, sglang, deepspeed, etc.) nnodes: Number of nodes nproc_per_node: GPUs per node master_port: Master communication port - + Returns: Launcher-specific environment setup and command string """ @@ -348,13 +352,17 @@ def _generate_launcher_command( elif launcher_type == "sglang": return self._generate_sglang_command(nnodes, nproc_per_node, master_port) elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": - return self._generate_sglang_disagg_command(nnodes, nproc_per_node, master_port) + return self._generate_sglang_disagg_command( + nnodes, nproc_per_node, master_port + ) elif launcher_type == "deepspeed": return self._generate_deepspeed_command(nnodes, nproc_per_node, master_port) elif launcher_type == "megatron": return self._generate_megatron_command(nnodes, nproc_per_node, master_port) elif launcher_type == "torchtitan": - return self._generate_torchtitan_command(nnodes, nproc_per_node, master_port) + return self._generate_torchtitan_command( + nnodes, nproc_per_node, master_port + ) elif launcher_type == "primus": return self._generate_primus_command( nnodes, nproc_per_node, master_port, model_name=model_name @@ -373,15 +381,15 @@ def _generate_torchrun_command( ) -> str: """ Generate torchrun launcher command for SLURM. 
- + For single-node (nnodes=1): Uses standalone mode For multi-node (nnodes>1): Uses distributed mode with SLURM environment - + Args: nnodes: Number of nodes nproc_per_node: GPUs per node master_port: Master port - + Returns: MAD_MULTI_NODE_RUNNER environment variable setup """ @@ -397,81 +405,81 @@ def _generate_vllm_command( ) -> str: """ Generate vLLM launcher environment variables. - + vLLM manages its own process spawning - no torchrun needed. Model script directly invokes vLLM with tensor/pipeline parallelism. - + Args: nnodes: Number of nodes nproc_per_node: GPUs per node master_port: Master port - + Returns: Environment variable setup for vLLM """ if nnodes == 1: - return f'''# vLLM single-node setup (Tensor Parallelism) + return f"""# vLLM single-node setup (Tensor Parallelism) export VLLM_TENSOR_PARALLEL_SIZE={nproc_per_node} export VLLM_PIPELINE_PARALLEL_SIZE=1 export VLLM_DISTRIBUTED_BACKEND="auto" -# vLLM handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' +# vLLM handles its own process management - no MAD_MULTI_NODE_RUNNER needed""" else: # One vLLM serve per node (TP only on that node), no shared Ray = data parallelism - return f'''# vLLM multi-node setup (data parallel: one serve per node, TP only) + return f"""# vLLM multi-node setup (data parallel: one serve per node, TP only) export VLLM_TENSOR_PARALLEL_SIZE={nproc_per_node} export VLLM_PIPELINE_PARALLEL_SIZE=1 export VLLM_DISTRIBUTED_BACKEND="none" -# vLLM handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' +# vLLM handles its own process management - no MAD_MULTI_NODE_RUNNER needed""" def _generate_sglang_command( self, nnodes: int, nproc_per_node: int, master_port: int ) -> str: """ Generate SGLang launcher environment variables. - + SGLang similar to vLLM - manages its own process spawning. 
- + Args: nnodes: Number of nodes nproc_per_node: GPUs per node master_port: Master port - + Returns: Environment variable setup for SGLang """ if nnodes == 1: - return f'''# SGLang single-node setup (Tensor Parallelism) + return f"""# SGLang single-node setup (Tensor Parallelism) export SGLANG_TENSOR_PARALLEL_SIZE={nproc_per_node} export SGLANG_PIPELINE_PARALLEL_SIZE=1 -# SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' +# SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed""" else: # One SGLang serve per node (TP only on that node), no cross-node coordination = data parallel - return f'''# SGLang multi-node setup (data parallel: one serve per node, TP only) + return f"""# SGLang multi-node setup (data parallel: one serve per node, TP only) export SGLANG_TENSOR_PARALLEL_SIZE={nproc_per_node} export SGLANG_PIPELINE_PARALLEL_SIZE=1 -# SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' +# SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed""" def _generate_sglang_disagg_command( self, nnodes: int, nproc_per_node: int, master_port: int ) -> str: """ Generate SGLang Disaggregated launcher environment for SLURM. 
- + SGLang Disaggregated Architecture: - Node 0: Proxy (load balancer) - Nodes 1 to xP: Prefill nodes - Nodes xP+1 to xP+yD: Decode nodes - + Minimum cluster: 3 nodes (1 proxy + 1 prefill + 1 decode) - + Args: nnodes: Total number of nodes (must be >= 3) nproc_per_node: GPUs per node (tensor parallel size) master_port: Master port for coordination - + Returns: Environment setup with node role assignment - + Raises: ValueError: If nnodes < 3 (minimum for disagg) """ @@ -480,12 +488,14 @@ def _generate_sglang_disagg_command( f"SGLang Disaggregated requires minimum 3 nodes " f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" ) - + # Check if custom split is specified in additional_context - sglang_disagg_config = self.config.additional_context.get("distributed", {}).get("sglang_disagg", {}) + sglang_disagg_config = self.config.additional_context.get( + "distributed", {} + ).get("sglang_disagg", {}) prefill_nodes = sglang_disagg_config.get("prefill_nodes") decode_nodes = sglang_disagg_config.get("decode_nodes") - + if prefill_nodes is not None and decode_nodes is not None: # User specified custom split - validate if prefill_nodes < 1 or decode_nodes < 1: @@ -506,8 +516,8 @@ def _generate_sglang_disagg_command( # For N total nodes: 1 proxy + ~40% prefill + ~60% decode xP = max(1, (nnodes - 1) * 2 // 5) # ~40% of worker nodes yD = nnodes - 1 - xP # remaining nodes - - return f'''# SGLang Disaggregated multi-node setup + + return f"""# SGLang Disaggregated multi-node setup # ============================================ # Cluster Configuration: # Total Nodes: {nnodes} @@ -545,21 +555,21 @@ def _generate_sglang_disagg_command( echo "==========================================" # No MAD_MULTI_NODE_RUNNER - SGLang disagg handles process management -# Model script should detect SGLANG_DISAGG_MODE and launch appropriately''' +# Model script should detect SGLANG_DISAGG_MODE and launch appropriately""" def _generate_deepspeed_command( self, nnodes: int, nproc_per_node: int, 
master_port: int ) -> str: """ Generate DeepSpeed launcher command. - + DeepSpeed has its own launcher similar to torchrun. - + Args: nnodes: Number of nodes nproc_per_node: GPUs per node master_port: Master port - + Returns: MAD_MULTI_NODE_RUNNER with deepspeed launcher """ @@ -579,14 +589,14 @@ def _generate_megatron_command( ) -> str: """ Generate Megatron-LM launcher command. - + Megatron-LM typically uses torchrun but with specific environment variables. - + Args: nnodes: Number of nodes nproc_per_node: GPUs per node master_port: Master port - + Returns: MAD_MULTI_NODE_RUNNER with megatron-specific setup """ @@ -609,24 +619,24 @@ def _generate_torchtitan_command( ) -> str: """ Generate TorchTitan launcher command for SLURM. - + TorchTitan is a PyTorch native platform for LLM pre-training that uses torchrun as its underlying launcher but requires additional configuration for multi-dimensional parallelism (FSDP2, Tensor Parallel, Pipeline Parallel). - + Key TorchTitan features: - Uses TOML configuration files for training setup - Supports FSDP2, Tensor Parallel, Pipeline Parallel, Context Parallel - Built on top of torchrun for distributed coordination - + For single-node (nnodes=1): Uses standalone torchrun mode For multi-node (nnodes>1): Uses distributed torchrun with SLURM environment - + Args: nnodes: Number of nodes nproc_per_node: GPUs per node master_port: Master port - + Returns: MAD_MULTI_NODE_RUNNER with torchtitan-specific setup """ @@ -664,15 +674,21 @@ def _generate_primus_command( We only export PRIMUS_CONFIG_PATH and optional PRIMUS_CLI_EXTRA. No MAD_MULTI_NODE_RUNNER. 
""" primus_cfg = merged_primus_config( - self.manifest if isinstance(getattr(self, "manifest", None), dict) else None, + ( + self.manifest + if isinstance(getattr(self, "manifest", None), dict) + else None + ), self.config.additional_context, ) config_path = primus_cfg.get("config_path", "exp_pretrain.yaml") cli_extra = primus_cfg.get("cli_extra", "") # Safe shell quoting for config_path and cli_extra config_path_quoted = config_path.replace('"', '\\"') - lines = [f'# Primus launcher (model script runs run_pretrain.sh)', - f'export PRIMUS_CONFIG_PATH="{config_path_quoted}"'] + lines = [ + f"# Primus launcher (model script runs run_pretrain.sh)", + f'export PRIMUS_CONFIG_PATH="{config_path_quoted}"', + ] if (cli_extra or "").strip(): cli_extra_quoted = cli_extra.replace('"', '\\"') lines.append(f'export PRIMUS_CLI_EXTRA="{cli_extra_quoted}"') @@ -690,23 +706,23 @@ def _generate_basic_env_command( ) -> str: """ Generate basic environment variables for unknown launchers. - + Provides standard distributed execution environment variables and lets the model script handle launcher invocation. - + Args: nnodes: Number of nodes nproc_per_node: GPUs per node master_port: Master port - + Returns: Basic environment variable setup """ - return f'''# Basic distributed environment (custom launcher) + return f"""# Basic distributed environment (custom launcher) export NNODES={nnodes} export NPROC_PER_NODE={nproc_per_node} export MASTER_PORT={master_port} -# Model script should handle launcher invocation''' +# Model script should handle launcher invocation""" def deploy(self) -> DeploymentResult: """Submit sbatch script to SLURM scheduler (locally).""" @@ -724,11 +740,17 @@ def deploy(self) -> DeploymentResult: # Health-check srun invocations create SLURM jobs; we cancel them after preflight. 
enable_preflight = self.slurm_config.get("enable_node_check", True) auto_cleanup = self.slurm_config.get("auto_cleanup_nodes", False) - allow_submit_without_clean = self.slurm_config.get("allow_submit_without_clean_nodes", False) + allow_submit_without_clean = self.slurm_config.get( + "allow_submit_without_clean_nodes", False + ) clean_nodes: List[str] = [] health_check_job_name: Optional[str] = None - if enable_preflight and self.nodes >= 1 and not self.slurm_config.get("nodelist"): + if ( + enable_preflight + and self.nodes >= 1 + and not self.slurm_config.get("nodelist") + ): try: selector = SlurmNodeSelector( console=self.console, @@ -741,10 +763,14 @@ def deploy(self) -> DeploymentResult: exclude=self.slurm_config.get("exclude"), constraint=self.slurm_config.get("constraint"), ) - health_check_job_name = getattr(selector, "_health_check_job_name", None) + health_check_job_name = getattr( + selector, "_health_check_job_name", None + ) # Update exclude list if we found dirty/unreachable/unknown nodes - if updated_exclude and updated_exclude != self.slurm_config.get("exclude", ""): + if updated_exclude and updated_exclude != self.slurm_config.get( + "exclude", "" + ): self.console.print( f"[dim]Updated exclude list for sbatch: {updated_exclude}[/dim]\n" ) @@ -757,7 +783,9 @@ def deploy(self) -> DeploymentResult: and not allow_submit_without_clean and len(clean_nodes) < self.nodes ): - SlurmNodeSelector.cancel_health_check_jobs(health_check_job_name, self.console) + SlurmNodeSelector.cancel_health_check_jobs( + health_check_job_name, self.console + ) return DeploymentResult( status=DeploymentStatus.FAILED, deployment_id="", @@ -774,13 +802,13 @@ def deploy(self) -> DeploymentResult: self.console.print(f"[dim]Using nodelist: {nodelist_str}[/dim]\n") self.prepare() except Exception as e: - self.console.print( - f"[yellow]⚠ Node health check failed: {e}[/yellow]" - ) + self.console.print(f"[yellow]⚠ Node health check failed: {e}[/yellow]") 
self.console.print("[dim]Continuing with job submission[/dim]\n") finally: # Always cancel health-check jobs so they do not stay in the queue - SlurmNodeSelector.cancel_health_check_jobs(health_check_job_name, self.console) + SlurmNodeSelector.cancel_health_check_jobs( + health_check_job_name, self.console + ) # ==================== END PREFLIGHT ==================== try: @@ -842,7 +870,7 @@ def monitor(self, deployment_id: str) -> DeploymentResult: return self._check_job_completion(deployment_id) status = result.stdout.strip().upper() - + # Check if live output is enabled live_output = self.config.additional_context.get("live_output", False) @@ -881,8 +909,11 @@ def monitor(self, deployment_id: str) -> DeploymentResult: ) except Exception as e: - self.console.print(f"[red]Monitor exception for job {deployment_id}: {e}[/red]") + self.console.print( + f"[red]Monitor exception for job {deployment_id}: {e}[/red]" + ) import traceback + self.console.print(f"[dim red]{traceback.format_exc()}[/dim red]") return DeploymentResult( status=DeploymentStatus.FAILED, @@ -893,80 +924,88 @@ def monitor(self, deployment_id: str) -> DeploymentResult: def _stream_job_output(self, job_id: str, final: bool = False): """Stream output from SLURM job output file.""" # Track last position read from output file - if not hasattr(self, '_output_positions'): + if not hasattr(self, "_output_positions"): self._output_positions = {} - + # Find output file output_dir = str(self.output_dir) output_pattern = f"{output_dir}/madengine-*_{job_id}_*.out" - + try: import glob + output_files = glob.glob(output_pattern) - + if not output_files: return # Output file not created yet - + output_file = output_files[0] # Use first match - + # Read new content from file try: - with open(output_file, 'r') as f: + with open(output_file, "r") as f: # Seek to last position last_pos = self._output_positions.get(job_id, 0) f.seek(last_pos) - + # Read new lines new_content = f.read() - + if new_content: # Print new 
output with prefix for line in new_content.splitlines(): if line.strip(): # Skip empty lines self.console.print(f"[dim cyan]│[/dim cyan] {line}") - + # Update position self._output_positions[job_id] = f.tell() - + except FileNotFoundError: pass # File not ready yet - + except Exception as e: # Silently ignore streaming errors to not disrupt monitoring if final: - self.console.print(f"[dim yellow]Note: Could not stream output: {e}[/dim yellow]") + self.console.print( + f"[dim yellow]Note: Could not stream output: {e}[/dim yellow]" + ) def _show_log_summary(self, job_id: str, success: bool = True): """Show a summary with pointers to log files instead of streaming verbose output.""" output_dir = str(self.output_dir) - + try: import glob + # Find output and error files for this job output_files = glob.glob(f"{output_dir}/madengine-*_{job_id}_*.out") error_files = glob.glob(f"{output_dir}/madengine-*_{job_id}_*.err") - + if output_files or error_files: status_symbol = "✓" if success else "✗" status_color = "green" if success else "red" - - self.console.print(f"[{status_color}]{status_symbol}[/{status_color}] SLURM job {job_id} logs saved to:") - + + self.console.print( + f"[{status_color}]{status_symbol}[/{status_color}] SLURM job {job_id} logs saved to:" + ) + for out_file in output_files: self.console.print(f" [cyan]→[/cyan] Output: {out_file}") - + for err_file in error_files: # Check if error file has content if os.path.exists(err_file) and os.path.getsize(err_file) > 0: self.console.print(f" [yellow]→[/yellow] Errors: {err_file}") - + if not success and error_files: # Show last few lines of error file for failed jobs for err_file in error_files: if os.path.exists(err_file) and os.path.getsize(err_file) > 0: - self.console.print(f"\n[yellow]Last 10 lines of error log:[/yellow]") + self.console.print( + f"\n[yellow]Last 10 lines of error log:[/yellow]" + ) try: - with open(err_file, 'r') as f: + with open(err_file, "r") as f: lines = f.readlines() for line in 
lines[-10:]: if line.strip(): @@ -975,10 +1014,14 @@ def _show_log_summary(self, job_id: str, success: bool = True): pass break # Only show first error file else: - self.console.print(f"[dim yellow]Note: Log files for job {job_id} not found in {output_dir}[/dim yellow]") - + self.console.print( + f"[dim yellow]Note: Log files for job {job_id} not found in {output_dir}[/dim yellow]" + ) + except Exception as e: - self.console.print(f"[dim yellow]Note: Could not locate log files: {e}[/dim yellow]") + self.console.print( + f"[dim yellow]Note: Could not locate log files: {e}[/dim yellow]" + ) def _check_job_completion(self, job_id: str) -> DeploymentResult: """Check completed job status using sacct (locally). @@ -1012,11 +1055,13 @@ def _check_job_completion(self, job_id: str) -> DeploymentResult: if result.returncode == 0: status = result.stdout.strip().upper() - self.console.print(f"[dim]SLURM job {job_id} final status: {status}[/dim]") - + self.console.print( + f"[dim]SLURM job {job_id} final status: {status}[/dim]" + ) + # Check if live output is enabled live_output = self.config.additional_context.get("live_output", False) - + if "COMPLETED" in status: # Show final output or summary based on live_output flag if live_output: @@ -1082,9 +1127,13 @@ def _build_perf_entry_from_aggregated( run_details = { "model": model_info.get("name", aggregated_record.get("model", "")), - "n_gpus": str(aggregated_record.get("n_gpus", self.nodes * self.gpus_per_node)), + "n_gpus": str( + aggregated_record.get("n_gpus", self.nodes * self.gpus_per_node) + ), "nnodes": str(aggregated_record.get("nnodes", self.nodes)), - "gpus_per_node": str(aggregated_record.get("gpus_per_node", self.gpus_per_node)), + "gpus_per_node": str( + aggregated_record.get("gpus_per_node", self.gpus_per_node) + ), "training_precision": model_info.get("training_precision", ""), "pipeline": get_pipeline(), "args": model_info.get("args", ""), @@ -1109,7 +1158,9 @@ def _build_perf_entry_from_aggregated( "data_size": 
"", "data_download_duration": "", "build_number": get_build_number(), - "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + "additional_docker_run_options": model_info.get( + "additional_docker_run_options", "" + ), } flatten_tags(run_details) @@ -1164,7 +1215,9 @@ def _build_common_info_dict( "data_size": "", "data_download_duration": "", "build_number": get_build_number(), - "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + "additional_docker_run_options": model_info.get( + "additional_docker_run_options", "" + ), } flatten_tags(result) return result @@ -1202,7 +1255,9 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: built_models_dict = self.manifest.get("built_models") or {} model_info_for_path = built_models_dict.get(model_key, {}) if model_key else {} model_name_for_path = model_info_for_path.get("name", model_key or "unknown") - model_name = model_key or "unknown" # image key for build_info / model_info_for_entry lookups + model_name = ( + model_key or "unknown" + ) # image key for build_info / model_info_for_entry lookups build_info = {} built_images = self.manifest.get("built_images") or {} @@ -1218,7 +1273,9 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: # Gather log content per node: from job_dir/node_N/ (new) or flat output_dir .out files per_node_log_contents: List[tuple] = [] - flat_out_files = sorted(self.output_dir.glob(f"madengine-*_{deployment_id}_*.out")) + flat_out_files = sorted( + self.output_dir.glob(f"madengine-*_{deployment_id}_*.out") + ) # Multi-node: only use explicit node logs (_node_N.out) to avoid also picking up # SBATCH %t output (madengine-*__0.out, _1.out), which would duplicate metrics. 
if self.nodes > 1: @@ -1249,7 +1306,9 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: # Multi-node: keep only log entries for actual node indices [0, nodes-1] if self.nodes > 1: - per_node_log_contents = [(n, c) for n, c in per_node_log_contents if n < self.nodes] + per_node_log_contents = [ + (n, c) for n, c in per_node_log_contents if n < self.nodes + ] # Copy flat logs into job_dir/node_/ for consistency if not already there. # Only create dirs for indices in [0, nodes-1] so we never create extra node_2, etc. @@ -1293,9 +1352,11 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: ) run_details_dict: Optional[Dict[str, Any]] = None - model_info_for_entry = (self.manifest.get("built_models") or {}).get( - model_key, {} - ) if model_key else {} + model_info_for_entry = ( + (self.manifest.get("built_models") or {}).get(model_key, {}) + if model_key + else {} + ) # Multiple results path: resolve CSV from job_dir/node_*, then cwd/run_directory mult_res = model_info_for_entry.get("multiple_results") @@ -1346,22 +1407,29 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: ) results["perf_files"] = [str(Path("perf.csv").resolve())] import csv as _csv + try: - with open(resolved_csv, "r", encoding="utf-8", errors="ignore") as f: + with open( + resolved_csv, "r", encoding="utf-8", errors="ignore" + ) as f: reader = _csv.DictReader(f) for row in reader: row = {k.strip(): v for k, v in row.items() if k} if row.get("performance") and row.get("metric"): - results["successful_runs"].append({ - "model": model_info_for_entry.get("name", "") + "_" + row.get("model", ""), - "status": "SUCCESS", - "performance": str(row.get("performance", "")), - "metric": row.get("metric", ""), - "duration": row.get("test_duration", ""), - "gpu_arch": gpu_arch, - "deployment": "slurm", - "machine": deployment_id, - }) + results["successful_runs"].append( + { + "model": model_info_for_entry.get("name", "") + + "_" + + row.get("model", ""), + 
"status": "SUCCESS", + "performance": str(row.get("performance", "")), + "metric": row.get("metric", ""), + "duration": row.get("test_duration", ""), + "gpu_arch": gpu_arch, + "deployment": "slurm", + "machine": deployment_id, + } + ) except Exception: pass self.console.print( @@ -1452,9 +1520,13 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: perf_csv_path = "perf.csv" self._ensure_perf_csv_exists() if run_details_dict.get("status") == "SUCCESS": - update_perf_csv(perf_csv=perf_csv_path, single_result=str(perf_entry_path)) + update_perf_csv( + perf_csv=perf_csv_path, single_result=str(perf_entry_path) + ) else: - update_perf_csv(perf_csv=perf_csv_path, exception_result=str(perf_entry_path)) + update_perf_csv( + perf_csv=perf_csv_path, exception_result=str(perf_entry_path) + ) try: scripts_path = model_info_for_entry.get("scripts", "") scripts_base_dir = scripts_base_dir_from(scripts_path) @@ -1476,7 +1548,9 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: num_entries=num_entries, ) except Exception as e: - self.console.print(f"[yellow]⚠ Could not update perf_super: {e}[/yellow]") + self.console.print( + f"[yellow]⚠ Could not update perf_super: {e}[/yellow]" + ) results["perf_files"] = [str(Path(perf_csv_path).resolve())] run_data = { "model": run_details_dict.get("model", ""), @@ -1558,13 +1632,10 @@ def _collect_results_parse_perf_csv( def cleanup(self, deployment_id: str) -> bool: """Cancel SLURM job if still running (locally).""" try: - subprocess.run( - ["scancel", deployment_id], capture_output=True, timeout=10 - ) + subprocess.run(["scancel", deployment_id], capture_output=True, timeout=10) self.console.print(f"[yellow]Cancelled SLURM job: {deployment_id}[/yellow]") return True except Exception as e: self.console.print(f"[yellow]⚠ Cleanup warning: {e}[/yellow]") return False - diff --git a/src/madengine/deployment/slurm_node_selector.py b/src/madengine/deployment/slurm_node_selector.py index 408e8d3c..4435d5c3 100644 
--- a/src/madengine/deployment/slurm_node_selector.py +++ b/src/madengine/deployment/slurm_node_selector.py @@ -23,6 +23,7 @@ class NodeHealth(Enum): """Health status of a compute node.""" + CLEAN = "clean" # No stale processes, ready to use DIRTY = "dirty" # Has stale Ray/vLLM processes UNREACHABLE = "unreachable" # Cannot connect to node @@ -32,18 +33,19 @@ class NodeHealth(Enum): @dataclass class NodeStatus: """Status of a compute node's GPUs.""" + node: str health: NodeHealth gpu_memory_used_gb: float gpu_memory_total_gb: float process_count: int error_message: Optional[str] = None - + @property def memory_free_gb(self) -> float: """Calculate free GPU memory.""" return self.gpu_memory_total_gb - self.gpu_memory_used_gb - + @property def memory_usage_percent(self) -> float: """Calculate memory usage percentage.""" @@ -55,17 +57,17 @@ def memory_usage_percent(self) -> float: class SlurmNodeSelector: """ Selects clean GPU nodes for SLURM job allocation. - + Checks candidate nodes for stale Ray/vLLM processes that would cause OOM errors. Can automatically clean dirty nodes or recommend exclusion. """ - + # Memory threshold: nodes with >50GB used are considered dirty MEMORY_THRESHOLD_GB = 50.0 - + # Process patterns that indicate stale processes STALE_PATTERNS = ["ray::", "RayWorkerWrapper", "raylet", "vllm"] - + def __init__( self, console: Optional[Console] = None, @@ -75,7 +77,7 @@ def __init__( ): """ Initialize node selector. 
- + Args: console: Rich console for output auto_cleanup: Automatically clean dirty nodes @@ -86,7 +88,7 @@ def __init__( self.auto_cleanup = auto_cleanup self.verbose = verbose self.timeout = timeout - + # Max candidates to check (avoids excessive checks on large clusters) MAX_CANDIDATES_CAP = 100 @@ -111,11 +113,14 @@ def get_candidate_nodes( """ cmd = [ "sinfo", - "-p", partition, + "-p", + partition, "-N", # Node-oriented format "-h", # No header - "-o", "%N", # Node name only - "-t", "idle", # Idle nodes only + "-o", + "%N", # Node name only + "-t", + "idle", # Idle nodes only ] if constraint: @@ -138,14 +143,14 @@ def get_candidate_nodes( # Parse nodes all_nodes = set() - for line in result.stdout.strip().split('\n'): + for line in result.stdout.strip().split("\n"): line = line.strip() if line: all_nodes.add(line) # Remove excluded nodes if exclude: - excluded = set(exclude.split(',')) + excluded = set(exclude.split(",")) all_nodes -= excluded # Return all idle nodes, capped to avoid excessive checks @@ -159,8 +164,10 @@ def get_candidate_nodes( if self.verbose: self.console.print(f"[yellow]⚠ Query failed: {e}[/yellow]") return None - - def check_node_health(self, node: str, job_name: Optional[str] = None) -> NodeStatus: + + def check_node_health( + self, node: str, job_name: Optional[str] = None + ) -> NodeStatus: """ Check GPU health on a node using srun. 
@@ -218,7 +225,7 @@ def check_node_health(self, node: str, job_name: Optional[str] = None) -> NodeSt text=True, timeout=self.timeout, ) - + if result.returncode != 0: return NodeStatus( node=node, @@ -228,27 +235,31 @@ def check_node_health(self, node: str, job_name: Optional[str] = None) -> NodeSt process_count=0, error_message=f"srun failed: {result.stderr[:100]}", ) - + # Parse output output = result.stdout - + # Extract GPU info - gpu_info = self._extract_section(output, "===GPU_INFO===", "===END_GPU_INFO===") - processes = self._extract_section(output, "===PROCESSES===", "===END_PROCESSES===") - + gpu_info = self._extract_section( + output, "===GPU_INFO===", "===END_GPU_INFO===" + ) + processes = self._extract_section( + output, "===PROCESSES===", "===END_PROCESSES===" + ) + # Parse GPU memory (simplified - in production would parse actual output) # For MI300X: typically 192GB per GPU total_memory_gb = 192.0 * 4 # Assume 4 GPUs - + # Count processes process_count = 0 if processes and "NO_PROCESSES" not in processes: - process_count = len([l for l in processes.split('\n') if l.strip()]) - + process_count = len([l for l in processes.split("\n") if l.strip()]) + # Estimate memory usage # Rough heuristic: each process uses ~45GB (observed from Job 2437) used_memory_gb = process_count * 45.0 - + # Determine health if process_count == 0: health = NodeHealth.CLEAN @@ -256,7 +267,7 @@ def check_node_health(self, node: str, job_name: Optional[str] = None) -> NodeSt health = NodeHealth.DIRTY else: health = NodeHealth.CLEAN # Minor processes, should be OK - + return NodeStatus( node=node, health=health, @@ -264,7 +275,7 @@ def check_node_health(self, node: str, job_name: Optional[str] = None) -> NodeSt gpu_memory_total_gb=total_memory_gb, process_count=process_count, ) - + except subprocess.TimeoutExpired: return NodeStatus( node=node, @@ -283,7 +294,7 @@ def check_node_health(self, node: str, job_name: Optional[str] = None) -> NodeSt process_count=0, 
error_message=str(e)[:100], ) - + def cleanup_node(self, node: str, job_name: Optional[str] = None) -> bool: """ Clean up stale processes on a node using srun. @@ -332,19 +343,21 @@ def cleanup_node(self, node: str, job_name: Optional[str] = None) -> bool: text=True, timeout=self.timeout, ) - + success = result.returncode == 0 and "CLEANUP_OK" in result.stdout - + if success and self.verbose: self.console.print(f"[green] ✓ Cleaned {node}[/green]") - + return success - + except Exception as e: if self.verbose: - self.console.print(f"[yellow] ⚠ Cleanup failed for {node}: {e}[/yellow]") + self.console.print( + f"[yellow] ⚠ Cleanup failed for {node}: {e}[/yellow]" + ) return False - + def select_nodes( self, partition: str, @@ -376,10 +389,14 @@ def select_nodes( ) # Unique job name for all health-check srun invocations (enables cleanup) - self._health_check_job_name = f"madengine_nodecheck_{os.getpid()}_{int(time.time())}" + self._health_check_job_name = ( + f"madengine_nodecheck_{os.getpid()}_{int(time.time())}" + ) # Get all idle candidate nodes - candidates = self.get_candidate_nodes(partition, nodes_needed, exclude, constraint) + candidates = self.get_candidate_nodes( + partition, nodes_needed, exclude, constraint + ) if not candidates: self.console.print( @@ -389,7 +406,9 @@ def select_nodes( return [], exclude or "" if self.verbose: - self.console.print(f"[dim]Idle candidates: {len(candidates)} (checking on-demand until {nodes_needed} clean)[/dim]\n") + self.console.print( + f"[dim]Idle candidates: {len(candidates)} (checking on-demand until {nodes_needed} clean)[/dim]\n" + ) # On-demand check: stop as soon as we have enough clean nodes statuses: List[NodeStatus] = [] @@ -434,22 +453,30 @@ def select_nodes( self.console.print("[yellow]Running automatic cleanup...[/yellow]\n") for status in dirty_nodes: self.console.print(f" Cleaning {status.node}...") - if self.cleanup_node(status.node, job_name=self._health_check_job_name): + if self.cleanup_node( + 
status.node, job_name=self._health_check_job_name + ): time.sleep(2) - new_status = self.check_node_health(status.node, job_name=self._health_check_job_name) + new_status = self.check_node_health( + status.node, job_name=self._health_check_job_name + ) if new_status.health == NodeHealth.CLEAN: clean_nodes.append(new_status.node) nodes_to_exclude.discard(status.node) - self.console.print(f" [green]✓ {status.node} is now clean[/green]") + self.console.print( + f" [green]✓ {status.node} is now clean[/green]" + ) else: - self.console.print(f" [red]✗ {status.node} still dirty[/red]") + self.console.print( + f" [red]✗ {status.node} still dirty[/red]" + ) else: self.console.print(f" [red]✗ Cleanup failed[/red]") # Build updated exclude list (dirty + unreachable + unknown) - existing_exclude = set(exclude.split(',')) if exclude else set() + existing_exclude = set(exclude.split(",")) if exclude else set() existing_exclude.update(nodes_to_exclude) - updated_exclude = ','.join(sorted(existing_exclude)) + updated_exclude = ",".join(sorted(existing_exclude)) if unreachable_nodes or unknown_nodes: bad = [s.node for s in unreachable_nodes] + [s.node for s in unknown_nodes] @@ -473,17 +500,17 @@ def select_nodes( f"\n[yellow]⚠ Only {len(clean_nodes)} clean nodes found " f"(need {nodes_needed})[/yellow]" ) - self.console.print("[yellow]Job may wait for additional nodes to become available[/yellow]\n") - else: self.console.print( - "\n[red]❌ No clean nodes available[/red]" + "[yellow]Job may wait for additional nodes to become available[/yellow]\n" ) + else: + self.console.print("\n[red]❌ No clean nodes available[/red]") self.console.print( "[yellow]Recommendation: Wait for nodes to be cleaned or run manual cleanup[/yellow]\n" ) return clean_nodes, updated_exclude - + def _extract_section(self, text: str, start_marker: str, end_marker: str) -> str: """Extract section between markers.""" try: @@ -492,17 +519,17 @@ def _extract_section(self, text: str, start_marker: str, end_marker: 
str) -> str return text[start:end].strip() except ValueError: return "" - + def _display_status_table(self, statuses: List[NodeStatus]): """Display node status in a table.""" table = Table(title="Node Health Status") - + table.add_column("Node", style="cyan", no_wrap=True) table.add_column("Health", style="bold") table.add_column("Memory Used", justify="right") table.add_column("Processes", justify="right") table.add_column("Notes", style="dim") - + for status in statuses: health_style = { NodeHealth.CLEAN: "green", @@ -510,18 +537,24 @@ def _display_status_table(self, statuses: List[NodeStatus]): NodeHealth.UNREACHABLE: "red", NodeHealth.UNKNOWN: "dim", }[status.health] - + health_text = { NodeHealth.CLEAN: "✓ Clean", NodeHealth.DIRTY: "⚠ Dirty", NodeHealth.UNREACHABLE: "✗ Unreachable", NodeHealth.UNKNOWN: "? Unknown", }[status.health] - - memory_text = f"{status.gpu_memory_used_gb:.0f} GB" if status.gpu_memory_used_gb > 0 else "-" - processes_text = str(status.process_count) if status.process_count > 0 else "-" + + memory_text = ( + f"{status.gpu_memory_used_gb:.0f} GB" + if status.gpu_memory_used_gb > 0 + else "-" + ) + processes_text = ( + str(status.process_count) if status.process_count > 0 else "-" + ) notes = status.error_message if status.error_message else "" - + table.add_row( status.node, f"[{health_style}]{health_text}[/{health_style}]", @@ -529,12 +562,14 @@ def _display_status_table(self, statuses: List[NodeStatus]): processes_text, notes, ) - + self.console.print(table) self.console.print() @staticmethod - def cancel_health_check_jobs(job_name: Optional[str], console: Optional[Console] = None) -> None: + def cancel_health_check_jobs( + job_name: Optional[str], console: Optional[Console] = None + ) -> None: """ Cancel any SLURM jobs created by the node health check (srun invocations). 
@@ -568,6 +603,8 @@ def cancel_health_check_jobs(job_name: Optional[str], console: Optional[Console] timeout=5, ) if job_ids and _console: - _console.print(f"[dim]Cancelled {len(job_ids)} health-check job(s)[/dim]") + _console.print( + f"[dim]Cancelled {len(job_ids)} health-check job(s)[/dim]" + ) except Exception: pass diff --git a/src/madengine/deployment/templates/kubernetes/configmap.yaml.j2 b/src/madengine/deployment/templates/kubernetes/configmap.yaml.j2 index 4b782832..21d28ed2 100644 --- a/src/madengine/deployment/templates/kubernetes/configmap.yaml.j2 +++ b/src/madengine/deployment/templates/kubernetes/configmap.yaml.j2 @@ -35,4 +35,3 @@ data: {{ script_content | indent(4, first=True) }} {% endfor %} {% endif %} - diff --git a/src/madengine/deployment/templates/kubernetes/job.yaml.j2 b/src/madengine/deployment/templates/kubernetes/job.yaml.j2 index 320d049f..bbfebdc6 100644 --- a/src/madengine/deployment/templates/kubernetes/job.yaml.j2 +++ b/src/madengine/deployment/templates/kubernetes/job.yaml.j2 @@ -38,14 +38,14 @@ spec: {% if host_ipc %} hostIPC: true {% endif %} - + {% if image_pull_secrets and image_pull_secrets|length > 0 %} imagePullSecrets: {% for ips in image_pull_secrets %} - name: {{ ips.name }} {% endfor %} {% endif %} - + # Init container extracts madengine scripts from package initContainers: - name: extract-scripts @@ -55,7 +55,7 @@ spec: - | set -e echo "=== Extracting madengine scripts ===" - + # Extract common scripts from ConfigMap (since madengine not installed in container) {% if common_script_contents %} echo "Extracting common scripts from ConfigMap..." 
@@ -68,7 +68,7 @@ spec: {% else %} echo "No common scripts to extract" {% endif %} - + # Copy K8s data provider script from ConfigMap if it exists if [ -f /config/data_provider.sh ]; then echo "Copying data_provider.sh to /workspace/data_provider.sh" @@ -76,7 +76,7 @@ spec: chmod +x /workspace/data_provider.sh echo "✓ Copied K8s data provider script" fi - + # Extract model scripts directory (all .sh, .py, and .json files) {% if model_scripts_contents %} echo "Extracting model scripts directory..." @@ -97,7 +97,7 @@ spec: {% else %} echo "Warning: No model scripts configured" {% endif %} - + echo "✓ Script extraction complete" volumeMounts: - name: workspace @@ -105,7 +105,7 @@ spec: - name: config mountPath: /config readOnly: true - + # Main container runs benchmark containers: - name: {{ main_container_name }} @@ -124,7 +124,7 @@ spec: echo "Launcher: {{ launcher_type }}" {% endif %} echo "===================================================================" - + # Copy config files from ConfigMap to workspace cp /config/build_manifest.json /workspace/ {% if include_credential_in_configmap %} @@ -135,14 +135,14 @@ spec: echo '{}' > /workspace/credential.json {% endif %} cp /config/data.json /workspace/ 2>/dev/null || true - + # GPU Information if command -v rocm-smi &> /dev/null; then echo "" echo "=== AMD GPU Information ===" rocm-smi || true fi - + # Set GPU visibility for ROCm/CUDA # CRITICAL: Ray (vLLM, SGLang) requires ONLY ONE visibility variable # - AMD GPUs: Use ONLY HIP_VISIBLE_DEVICES @@ -174,13 +174,13 @@ spec: export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-{{ gpu_visibility }}} {% endif %} export MAD_SYSTEM_GPU_ARCHITECTURE={{ gpu_architecture }} - + # K8s environment export MAD_K8S_POD_NAME=$HOSTNAME export MAD_K8S_NAMESPACE={{ namespace }} export MAD_K8S_JOB=true export MAD_DEPLOYMENT_TYPE=kubernetes - + {% if launcher_type == "torchrun" or launcher_type == "deepspeed" or launcher_type == "megatron" or launcher_type == "primus" or launcher_type == 
"torchtitan" %} # {{ launcher_type }} distributed environment (auto-configured from K8s) {% if nnodes > 1 %} @@ -192,7 +192,7 @@ spec: export JOB_COMPLETION_INDEX=0 {% endif %} {% endif %} - + # Data provider environment variables {% if data_config %} echo "" @@ -203,7 +203,7 @@ spec: {% endfor %} echo "✓ Data environment configured for: {{ data_config.data_name }}" {% endif %} - + # Tools configuration environment variables {% if tools_config %} echo "" @@ -218,14 +218,14 @@ spec: {% endfor %} echo "✓ Tools configuration applied" {% endif %} - + {% if launcher_command %} # Launcher-based execution with tools echo "" echo "=== Starting benchmark with {{ launcher_type }} ===" - + cd /workspace - + # Download data if data provider is configured {% if data_provider_script and data_config %} echo "" @@ -233,14 +233,14 @@ spec: echo "Data name: {{ data_config.data_name }}" echo "Source: {{ data_config.source_url }}" echo "Target: {{ data_config.datahome }}" - + # Use K8s data provider script (loaded from ConfigMap) if [ -f /workspace/data_provider.sh ]; then bash /workspace/data_provider.sh \ "{{ data_config.data_name }}" \ "{{ data_config.source_url }}" \ "{{ data_config.datahome }}" - + # Source metrics if available if [ -f /tmp/mad_metrics.env ]; then source /tmp/mad_metrics.env @@ -251,7 +251,7 @@ spec: exit 1 fi {% endif %} - + # Run pre-scripts (like local execution) {% if pre_scripts %} echo "" @@ -269,7 +269,7 @@ spec: {% else %} echo "No pre-scripts configured" {% endif %} - + # Clear MIOpen cache to prevent "Duplicate ID" warnings echo "" echo "=== Clearing MIOpen cache ===" @@ -278,10 +278,10 @@ spec: echo "✓ Cleared MIOpen cache directory" fi mkdir -p "${MIOPEN_USER_DB_PATH:-/tmp/.miopen}" - + # Primus: experiment YAMLs are in the ConfigMap as Primus/examples/... and extracted # to /workspace/Primus (see madengine _bundle_primus_k8s_examples_overlay); PRIMUS_ROOT=/workspace/Primus. 
- + # Create wrapper script for launcher echo "" echo "=== Running model benchmark with launcher ===" @@ -290,7 +290,7 @@ spec: {{ launcher_command | indent(12, first=False) }} LAUNCHER_EOF chmod +x /tmp/run_launcher.sh - + {% if tools_config and tools_config|length > 0 %} # Run with profiling tools {% for tool in tools_config %} @@ -299,7 +299,7 @@ spec: {% endif %} {% endfor %} {% endif %} - + # Execute launcher with tool chain MODEL_START_TIME=$(date +%s.%N) {% if launcher_tool_chain and launcher_tool_chain != "bash /tmp/run_launcher.sh" %} @@ -311,7 +311,7 @@ spec: MODEL_END_TIME=$(date +%s.%N) MODEL_DURATION=$(awk "BEGIN {printf \"%.6f\", $MODEL_END_TIME - $MODEL_START_TIME}") echo "test_duration: ${MODEL_DURATION}s" - + # Run post-scripts (like local execution) {% if post_scripts %} echo "" @@ -329,12 +329,12 @@ spec: {% else %} echo "No post-scripts configured" {% endif %} - + # Copy artifacts to PVC shared storage (always enabled) echo "" echo "=== Copying artifacts to PVC storage ===" mkdir -p /results/${HOSTNAME} - + # Copy performance results if [ -f "perf.csv" ]; then cp perf.csv /results/${HOSTNAME}/perf.csv @@ -363,13 +363,13 @@ spec: fi fi {% endif %} - + # Copy environment details if ls *_env.csv 1> /dev/null 2>&1; then cp *_env.csv /results/${HOSTNAME}/ echo "✓ Copied environment CSV files" fi - + # Copy profiling outputs (rocprof, rocprofv3) if ls results* 1> /dev/null 2>&1; then cp -r results* /results/${HOSTNAME}/ 2>/dev/null || true @@ -391,7 +391,7 @@ spec: cp -r rocm_trace_lite_output /results/${HOSTNAME}/ 2>/dev/null || true echo "✓ Copied rocm_trace_lite_output" fi - + # Copy tool-specific outputs if ls gpu_info_*.csv 1> /dev/null 2>&1; then cp gpu_info_*.csv /results/${HOSTNAME}/ @@ -405,15 +405,15 @@ spec: cp prof.csv /results/${HOSTNAME}/ echo "✓ Copied prof.csv" fi - + echo "✓ All artifacts copied to PVC: /results/${HOSTNAME}/" - + echo "=== Benchmark job completed with exit code $MODEL_EXIT_CODE ===" exit $MODEL_EXIT_CODE {% else %} # 
Direct script execution cd /workspace - + # Download data if data provider is configured {% if data_provider_script and data_config %} echo "" @@ -421,14 +421,14 @@ spec: echo "Data name: {{ data_config.data_name }}" echo "Source: {{ data_config.source_url }}" echo "Target: {{ data_config.datahome }}" - + # Use K8s data provider script (loaded from ConfigMap) if [ -f /workspace/data_provider.sh ]; then bash /workspace/data_provider.sh \ "{{ data_config.data_name }}" \ "{{ data_config.source_url }}" \ "{{ data_config.datahome }}" - + # Source metrics if available if [ -f /tmp/mad_metrics.env ]; then source /tmp/mad_metrics.env @@ -439,7 +439,7 @@ spec: exit 1 fi {% endif %} - + # Run pre-scripts (like local execution) {% if pre_scripts %} echo "" @@ -457,7 +457,7 @@ spec: {% else %} echo "No pre-scripts configured" {% endif %} - + # Clear MIOpen cache to prevent "Duplicate ID" warnings echo "" echo "=== Clearing MIOpen cache ===" @@ -466,7 +466,7 @@ spec: echo "✓ Cleared MIOpen cache directory" fi mkdir -p "${MIOPEN_USER_DB_PATH:-/tmp/.miopen}" - + # Run main model script echo "" echo "=== Running model benchmark script ===" @@ -479,7 +479,7 @@ spec: {% endif %} {% endfor %} {% endif %} - + # Execute script with tool chain MODEL_START_TIME=$(date +%s.%N) {% if direct_script_tool_chain and direct_script_tool_chain != "bash " ~ model_script %} @@ -500,7 +500,7 @@ spec: ls -la /workspace/scripts/ 2>/dev/null || echo "scripts/ directory not found" exit 1 fi - + # Run post-scripts (like local execution) {% if post_scripts %} echo "" @@ -518,12 +518,12 @@ spec: {% else %} echo "No post-scripts configured" {% endif %} - + # Copy artifacts to PVC shared storage (always enabled) echo "" echo "=== Copying artifacts to PVC storage ===" mkdir -p /results/${HOSTNAME} - + # Copy performance results if [ -f "perf.csv" ]; then cp perf.csv /results/${HOSTNAME}/perf.csv @@ -552,13 +552,13 @@ spec: fi fi {% endif %} - + # Copy environment details if ls *_env.csv 1> /dev/null 2>&1; 
then cp *_env.csv /results/${HOSTNAME}/ echo "✓ Copied environment CSV files" fi - + # Copy profiling outputs (rocprof, rocprofv3) if ls results* 1> /dev/null 2>&1; then cp -r results* /results/${HOSTNAME}/ 2>/dev/null || true @@ -580,13 +580,13 @@ spec: cp -r rocm_trace_lite_output /results/${HOSTNAME}/ 2>/dev/null || true echo "✓ Copied rocm_trace_lite_output" fi - + # Copy GPU profiler outputs if ls gpu_info_*.csv 1> /dev/null 2>&1; then cp gpu_info_*.csv /results/${HOSTNAME}/ echo "✓ Copied GPU profiler outputs" fi - + # Copy library trace outputs if ls *_trace_output.csv 1> /dev/null 2>&1; then cp *_trace_output.csv /results/${HOSTNAME}/ @@ -596,20 +596,20 @@ spec: cp library_trace.csv /results/${HOSTNAME}/library_trace.csv echo "✓ Copied library_trace.csv" fi - + # Copy tracing outputs if ls trace.* 1> /dev/null 2>&1; then cp trace.* /results/${HOSTNAME}/ 2>/dev/null || true echo "✓ Copied tracing files" fi - + echo "✓ All artifacts copied to PVC: /results/${HOSTNAME}/" - + echo "" echo "=== Benchmark job completed with exit code ${MODEL_EXIT_CODE:-0} ===" exit ${MODEL_EXIT_CODE:-0} {% endif %} - + resources: requests: {{ gpu_resource_name }}: "{{ gpu_count }}" @@ -619,13 +619,13 @@ spec: {{ gpu_resource_name }}: "{{ gpu_count }}" memory: "{{ memory_limit }}" cpu: "{{ cpu_limit }}" - + env: {% for key, value in env_vars.items() %} - name: {{ key }} value: "{{ value }}" {% endfor %} - + volumeMounts: - name: workspace mountPath: /workspace @@ -647,7 +647,7 @@ spec: mountPath: /data readOnly: false # Must be writable for data provider downloads {% endif %} - + {% if privileged_profiling %} securityContext: capabilities: @@ -656,7 +656,7 @@ spec: seccompProfile: type: Unconfined {% endif %} - + {% if tolerations %} tolerations: {% for toleration in tolerations %} @@ -672,7 +672,7 @@ spec: {% endif %} {% endfor %} {% endif %} - + volumes: - name: workspace emptyDir: {} @@ -699,4 +699,3 @@ spec: persistentVolumeClaim: claimName: {{ data_pvc }} {% endif %} - diff 
--git a/src/madengine/deployment/templates/kubernetes/pvc-data.yaml.j2 b/src/madengine/deployment/templates/kubernetes/pvc-data.yaml.j2 index c5bc8396..e6aa8d00 100644 --- a/src/madengine/deployment/templates/kubernetes/pvc-data.yaml.j2 +++ b/src/madengine/deployment/templates/kubernetes/pvc-data.yaml.j2 @@ -19,4 +19,3 @@ spec: {% if storage_class %} storageClassName: {{ storage_class }} {% endif %} - diff --git a/src/madengine/deployment/templates/kubernetes/pvc.yaml.j2 b/src/madengine/deployment/templates/kubernetes/pvc.yaml.j2 index fe1395e0..953a53c8 100644 --- a/src/madengine/deployment/templates/kubernetes/pvc.yaml.j2 +++ b/src/madengine/deployment/templates/kubernetes/pvc.yaml.j2 @@ -16,4 +16,3 @@ spec: {% if storage_class %} storageClassName: {{ storage_class }} {% endif %} - diff --git a/src/madengine/deployment/templates/kubernetes/service.yaml.j2 b/src/madengine/deployment/templates/kubernetes/service.yaml.j2 index e02836ee..e5ee9f89 100644 --- a/src/madengine/deployment/templates/kubernetes/service.yaml.j2 +++ b/src/madengine/deployment/templates/kubernetes/service.yaml.j2 @@ -17,4 +17,3 @@ spec: targetPort: {{ port }} protocol: TCP {% endfor %} - diff --git a/src/madengine/deployment/templates/slurm/job.sh.j2 b/src/madengine/deployment/templates/slurm/job.sh.j2 index 5f8e8266..3d1525d8 100644 --- a/src/madengine/deployment/templates/slurm/job.sh.j2 +++ b/src/madengine/deployment/templates/slurm/job.sh.j2 @@ -215,11 +215,11 @@ echo "Verifying madengine availability..." 
if command -v madengine >/dev/null 2>&1; then MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown") MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown") - + echo " ✓ madengine available" echo " Version: $MAD_CLI_VERSION" echo " Path: $MAD_CLI_PATH" - + # Verify it's executable if madengine --help >/dev/null 2>&1; then export MAD_CLI_COMMAND="madengine" @@ -263,15 +263,15 @@ if 'deployment_config' in manifest: gpus_per_node = None if 'slurm' in manifest['deployment_config']: gpus_per_node = manifest['deployment_config']['slurm'].get('gpus_per_node') - + # Set to 'docker' instead of 'local' to force container execution manifest['deployment_config']['target'] = 'docker' - + # Remove scheduler configs (but keep built_images!) manifest['deployment_config'].pop('slurm', None) manifest['deployment_config'].pop('k8s', None) manifest['deployment_config'].pop('kubernetes', None) - + if gpus_per_node: manifest['deployment_config']['gpus_per_node'] = gpus_per_node @@ -490,11 +490,11 @@ echo "Verifying madengine availability..." if command -v madengine >/dev/null 2>&1; then MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown") MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown") - + echo "✓ madengine available" echo " Version: $MAD_CLI_VERSION" echo " Path: $MAD_CLI_PATH" - + # Verify it's executable if madengine --help >/dev/null 2>&1; then echo " ✓ Verified: madengine is functional" @@ -542,15 +542,15 @@ if 'deployment_config' in manifest: gpus_per_node = None if 'slurm' in manifest['deployment_config']: gpus_per_node = manifest['deployment_config']['slurm'].get('gpus_per_node') - + # Set to 'docker' instead of 'local' to force container execution manifest['deployment_config']['target'] = 'docker' - + # Remove scheduler configs (but keep built_images!) 
manifest['deployment_config'].pop('slurm', None) manifest['deployment_config'].pop('k8s', None) manifest['deployment_config'].pop('kubernetes', None) - + if gpus_per_node: manifest['deployment_config']['gpus_per_node'] = gpus_per_node @@ -819,4 +819,3 @@ else fi exit $EXIT_CODE - diff --git a/src/madengine/execution/__init__.py b/src/madengine/execution/__init__.py index c7be268e..a687f394 100644 --- a/src/madengine/execution/__init__.py +++ b/src/madengine/execution/__init__.py @@ -9,4 +9,3 @@ from .container_runner import ContainerRunner __all__ = ["ContainerRunner"] - diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 2ffc8a31..1074f09e 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -29,7 +29,10 @@ update_perf_csv, flatten_tags, ) -from madengine.reporting.update_perf_super import update_perf_super_json, update_perf_super_csv +from madengine.reporting.update_perf_super import ( + update_perf_super_json, + update_perf_super_csv, +) from madengine.utils.gpu_config import resolve_runtime_gpus from madengine.utils.config_parser import ConfigParser from madengine.utils.path_utils import scripts_base_dir_from @@ -86,8 +89,7 @@ def _sh(cmd: str) -> str: host_install_type = ( "therock" if _host_rocm_path.is_dir() and is_therock_tree(_host_rocm_path) - else "apt install" if _host_rocm_path.is_dir() - else "unknown" + else "apt install" if _host_rocm_path.is_dir() else "unknown" ) try: host_rocm_ver = context._get_tool_manager().get_version() or "unknown" @@ -108,14 +110,13 @@ def _sh(cmd: str) -> str: # ROCm root: prefer rocm-sdk, then ROCM_PATH env, then /opt/rocm ctr_rocm_root = _sh( - "rocm-sdk path --root 2>/dev/null " - "|| echo \"${ROCM_PATH:-/opt/rocm}\"" + "rocm-sdk path --root 2>/dev/null " '|| echo "${ROCM_PATH:-/opt/rocm}"' ) # ROCm version: prefer rocm-sdk, then .info/version, then rocminfo ctr_rocm_ver = _sh( "rocm-sdk version 2>/dev/null 
" - "|| cat \"${ROCM_PATH:-/opt/rocm}/.info/version\" 2>/dev/null " + '|| cat "${ROCM_PATH:-/opt/rocm}/.info/version" 2>/dev/null ' "|| rocminfo 2>/dev/null | grep -i 'ROCm Version' | head -n1 | sed 's/.*[Vv]ersion:[[:space:]]*//;s/[[:space:]].*//;s/[^0-9.]//g' 2>/dev/null " "|| echo unknown" ) @@ -129,14 +130,16 @@ def _sh(cmd: str) -> str: # ── Host side ────────────────────────────────────────────── def _host_sh(cmd: str) -> str: try: - return subprocess.check_output(cmd, shell=True, stderr=subprocess.DEVNULL, text=True).strip() + return subprocess.check_output( + cmd, shell=True, stderr=subprocess.DEVNULL, text=True + ).strip() except Exception: return "unknown" host_cuda_root = _host_sh( "nvcc --version 2>/dev/null | sed -n 's/.*release \\([0-9][0-9.]*\\).*/\\1/p' | head -1 | " "xargs -I{} dirname $(which nvcc 2>/dev/null) 2>/dev/null | xargs dirname 2>/dev/null " - "|| echo \"${CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}\"" + '|| echo "${CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}"' ) host_cuda_ver = _host_sh( "nvcc --version 2>/dev/null | sed -n 's/.*release \\([0-9][0-9.]*\\).*/\\1/p' | head -1 " @@ -147,7 +150,7 @@ def _host_sh(cmd: str) -> str: # ── Container side ───────────────────────────────────────── ctr_cuda_root = _sh( "dirname $(which nvcc 2>/dev/null) 2>/dev/null | xargs dirname 2>/dev/null " - "|| echo \"${CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}\"" + '|| echo "${CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}"' ) ctr_cuda_ver = _sh( "nvcc --version 2>/dev/null | sed -n 's/.*release \\([0-9][0-9.]*\\).*/\\1/p' | head -1 " @@ -164,7 +167,9 @@ def _host_sh(cmd: str) -> str: rich_console.print(f"[dim]{'=' * 80}[/dim]\n") -def _resolve_multiple_results_path(multiple_results: str, model_dir: str) -> typing.Optional[str]: +def _resolve_multiple_results_path( + multiple_results: str, model_dir: str +) -> typing.Optional[str]: """Resolve multiple_results CSV path: try cwd then model_dir. 
Return first that exists.""" if not multiple_results: return None @@ -199,9 +204,7 @@ def _cp_model_dir_file_to_cwd_cmd(model_dir: str, relative_path: str) -> str: """``cp --`` from ``model_dir/relative`` to ``.`` with quoted paths (no injection).""" rel = (relative_path or "").strip() src = os.path.normpath(os.path.join(model_dir, rel)).replace("\\", "/") - return ( - f"cp -- {_bash_quote_path(src)} {_bash_quote_path('.')} 2>/dev/null || true" - ) + return f"cp -- {_bash_quote_path(src)} {_bash_quote_path('.')} 2>/dev/null || true" class ContainerRunner: @@ -272,36 +275,42 @@ def create_run_details_dict( # Resolve GPU count using hierarchical resolution resolved_gpu_count = resolve_runtime_gpus(model_info, self.additional_context) - + # Convert -1 (all GPUs) to actual system GPU count for accurate reporting if resolved_gpu_count == -1 and self.context: try: - system_ngpus = int(self.context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"]) + system_ngpus = int( + self.context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] + ) resolved_gpu_count = system_ngpus - print(f"ℹ️ Converted n_gpus=-1 to actual system GPU count: {system_ngpus}") + print( + f"ℹ️ Converted n_gpus=-1 to actual system GPU count: {system_ngpus}" + ) except (KeyError, ValueError, TypeError): # If system GPU count not available, keep -1 pass - + # Determine number of nodes and GPUs per node # Priority: 1. SLURM env vars, 2. additional_context, 3. model_info, 4. 
default (1) nnodes = "1" # Default for local execution gpus_per_node = str(resolved_gpu_count) - + # Check for SLURM multi-node environment if os.environ.get("MAD_DEPLOYMENT_TYPE") == "slurm": # Get from SLURM environment variables (most accurate for SLURM jobs) slurm_nnodes = os.environ.get("NNODES") or os.environ.get("SLURM_NNODES") - slurm_gpus_per_node = os.environ.get("GPUS_PER_NODE") or os.environ.get("SLURM_GPUS_PER_NODE") - + slurm_gpus_per_node = os.environ.get("GPUS_PER_NODE") or os.environ.get( + "SLURM_GPUS_PER_NODE" + ) + if slurm_nnodes: nnodes = str(slurm_nnodes) print(f"ℹ️ Detected SLURM multi-node: {nnodes} nodes") - + if slurm_gpus_per_node: gpus_per_node = str(slurm_gpus_per_node) print(f"ℹ️ GPUs per node: {gpus_per_node}") - + # Fallback to additional_context (for non-SLURM or if env vars not set) if nnodes == "1" and self.additional_context: slurm_config = self.additional_context.get("slurm", {}) @@ -312,43 +321,43 @@ def create_run_details_dict( nnodes = str(ctx_nodes) if ctx_gpus: gpus_per_node = str(ctx_gpus) - + # Final fallback to model_info if nnodes == "1": nnodes = model_info.get("nnodes", "1") - + # Calculate total GPUs try: total_gpus = int(nnodes) * int(gpus_per_node) except (ValueError, TypeError): total_gpus = resolved_gpu_count - + # Extract launcher from multiple sources in priority order: # 1. additional_context (passed via --additional-context CLI arg) # 2. model_info distributed config (in models.json) # 3. MAD_LAUNCHER environment variable # 4. 
Default to 'docker' for local deployments launcher = "" - + # Check additional_context first (highest priority) if self.additional_context: distributed_config = self.additional_context.get("distributed", {}) launcher = distributed_config.get("launcher", "") if launcher: print(f"🚀 Launcher from additional_context: {launcher}") - + # Check model_info distributed config if not launcher and model_info.get("distributed"): launcher = model_info["distributed"].get("launcher", "") if launcher: print(f"🚀 Launcher from model_info: {launcher}") - + # Fallback to environment variable if not launcher: launcher = os.environ.get("MAD_LAUNCHER", "") if launcher: print(f"🚀 Launcher from MAD_LAUNCHER env: {launcher}") - + # Apply deployment-specific defaults if no launcher specified deployment_type = os.environ.get("MAD_DEPLOYMENT_TYPE", "local") if not launcher: @@ -363,13 +372,15 @@ def create_run_details_dict( elif deployment_type == "local": launcher = "docker" print(f"🚀 Launcher defaulted to 'docker' for local deployment") - + # Print final launcher selection if launcher: - print(f"✅ Final launcher selected: '{launcher}' (deployment_type: {deployment_type})") + print( + f"✅ Final launcher selected: '{launcher}' (deployment_type: {deployment_type})" + ) else: print(f"⚠️ No launcher specified (deployment_type: {deployment_type})") - + # Create run details dict with all required fields run_details = { "model": model_info["name"], @@ -383,10 +394,14 @@ def create_run_details_dict( "docker_file": build_info.get("dockerfile", ""), "base_docker": build_info.get("base_docker", ""), "docker_sha": build_info.get("docker_sha", ""), - "docker_image": run_results.get("docker_image", build_info.get("docker_image", "")), + "docker_image": run_results.get( + "docker_image", build_info.get("docker_image", "") + ), "git_commit": run_results.get("git_commit", ""), "machine_name": run_results.get("machine_name", ""), - "deployment_type": os.environ.get("MAD_DEPLOYMENT_TYPE", "local"), # local, 
slurm, etc. + "deployment_type": os.environ.get( + "MAD_DEPLOYMENT_TYPE", "local" + ), # local, slurm, etc. "launcher": launcher, # Distributed launcher: torchrun, vllm, sglang, deepspeed, etc. "gpu_architecture": ( (self.context.ctx.get("docker_env_vars") or {}).get( @@ -420,8 +435,7 @@ def create_run_details_dict( scripts_base_dir = scripts_base_dir_from(scripts_path) config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) run_details["configs"] = config_parser.parse_and_load( - model_info.get("args", ""), - scripts_path + model_info.get("args", ""), scripts_path ) except Exception as e: print(f"⚠️ Warning: Could not parse config file: {e}") @@ -544,40 +558,50 @@ def pull_image( if registry and credentials: self.login_to_registry(registry, credentials) - self.rich_console.print(f"\n[bold blue]📥 Starting docker pull from registry...[/bold blue]") + self.rich_console.print( + f"\n[bold blue]📥 Starting docker pull from registry...[/bold blue]" + ) print(f"📍 Registry: {registry or 'Default'}") print(f"🏷️ Image: {registry_image}") - + # Force fresh pull on SLURM compute nodes to avoid corrupted cached layers # This prevents "permission denied" errors from corrupted image layers deployment_type = os.environ.get("MAD_DEPLOYMENT_TYPE", "local") in_slurm_job = os.environ.get("MAD_IN_SLURM_JOB", "0") == "1" - + if deployment_type == "slurm" and in_slurm_job: - print(f"🔄 Using fresh pull policy for SLURM compute node (prevents cached layer corruption)") + print( + f"🔄 Using fresh pull policy for SLURM compute node (prevents cached layer corruption)" + ) # Remove any existing cached image to force fresh pull try: self.console.sh(f"docker rmi -f {registry_image} 2>/dev/null || true") print(f"✓ Removed cached image layers") except Exception: pass # It's okay if image doesn't exist - + try: self.console.sh(f"docker pull {registry_image}") if local_name: self.console.sh(f"docker tag {registry_image} {local_name}") print(f"🏷️ Tagged as: {local_name}") - 
self.rich_console.print(f"[bold green]✅ Successfully pulled and tagged image[/bold green]") + self.rich_console.print( + f"[bold green]✅ Successfully pulled and tagged image[/bold green]" + ) self.rich_console.print(f"[dim]{'='*80}[/dim]") return local_name - self.rich_console.print(f"[bold green]✅ Successfully pulled image:[/bold green] [cyan]{registry_image}[/cyan]") + self.rich_console.print( + f"[bold green]✅ Successfully pulled image:[/bold green] [cyan]{registry_image}[/cyan]" + ) self.rich_console.print(f"[dim]{'='*80}[/dim]") return registry_image except Exception as e: - self.rich_console.print(f"[red]❌ Failed to pull image {registry_image}: {e}[/red]") + self.rich_console.print( + f"[red]❌ Failed to pull image {registry_image}: {e}[/red]" + ) raise def get_gpu_arg(self, requested_gpus: str) -> str: @@ -755,7 +779,7 @@ def apply_tools( # Update environment variables (always apply, even if cmd is duplicate) if "env_vars" in tool_config: run_env.update(tool_config["env_vars"]) - + # Only add cmd if it hasn't been added yet # This prevents duplicate wrappers like get_library_trace.py if "cmd" in tool_config: @@ -763,13 +787,13 @@ def apply_tools( if cmd not in added_cmds: # Prepend encapsulate cmd pre_encapsulate_post_scripts["encapsulate_script"] = ( - cmd - + " " - + pre_encapsulate_post_scripts["encapsulate_script"] + cmd + " " + pre_encapsulate_post_scripts["encapsulate_script"] ) added_cmds.add(cmd) else: - print(f" Note: Command '{cmd}' already added by another tool, skipping duplicate.") + print( + f" Note: Command '{cmd}' already added by another tool, skipping duplicate." 
+ ) def run_pre_post_script( self, model_docker: Docker, model_dir: str, pre_post: typing.List @@ -861,7 +885,9 @@ def run_container( Returns: dict: Execution results including performance metrics """ - self.rich_console.print(f"[bold green]🏃 Running model:[/bold green] [bold cyan]{model_info['name']}[/bold cyan] [dim]in container[/dim] [yellow]{docker_image}[/yellow]") + self.rich_console.print( + f"[bold green]🏃 Running model:[/bold green] [bold cyan]{model_info['name']}[/bold cyan] [dim]in container[/dim] [yellow]{docker_image}[/yellow]" + ) # Resolve image: if model-specific image is missing, try shared primus_pretrain image (one build for all configs) docker_image = self._resolve_docker_image(docker_image, model_info["name"]) @@ -933,11 +959,11 @@ def run_container( # Add environment variables docker_options += f"--env MAD_MODEL_NAME='{model_info['name']}' " - if model_info.get('multiple_results'): - docker_options += f"--env MAD_OUTPUT_CSV='{model_info['multiple_results']}' " - docker_options += ( - f"--env JENKINS_BUILD_NUMBER='{get_build_number()}' " - ) + if model_info.get("multiple_results"): + docker_options += ( + f"--env MAD_OUTPUT_CSV='{model_info['multiple_results']}' " + ) + docker_options += f"--env JENKINS_BUILD_NUMBER='{get_build_number()}' " # Gather data and environment run_env = {} @@ -947,12 +973,14 @@ def run_container( # Also check shell environment for SLURM-passed variables if "docker_env_vars" not in self.context.ctx: self.context.ctx["docker_env_vars"] = {} - + # For SLURM jobs, check shell environment and populate additional_context with GPU info # This ensures GPU resolution works correctly if os.environ.get("MAD_DEPLOYMENT_TYPE") == "slurm": if "NPROC_PER_NODE" in os.environ or "GPUS_PER_NODE" in os.environ: - gpus_per_node_str = os.environ.get("NPROC_PER_NODE") or os.environ.get("GPUS_PER_NODE") + gpus_per_node_str = os.environ.get("NPROC_PER_NODE") or os.environ.get( + "GPUS_PER_NODE" + ) if gpus_per_node_str: try: gpus = 
int(gpus_per_node_str) @@ -962,44 +990,65 @@ def run_container( self.additional_context = {} if "gpus_per_node" not in self.additional_context: self.additional_context["gpus_per_node"] = gpus - print(f"ℹ️ SLURM GPU override: {gpus} GPUs per node (from shell environment)") + print( + f"ℹ️ SLURM GPU override: {gpus} GPUs per node (from shell environment)" + ) except ValueError: pass - + # List of environment variables to pass from shell to Docker (for SLURM jobs) slurm_env_vars = [ - 'MASTER_ADDR', 'MASTER_PORT', 'WORLD_SIZE', 'RANK', 'NODE_RANK', - 'NNODES', 'NPROC_PER_NODE', 'MAD_MULTI_NODE_RUNNER', - 'MAD_COLLECT_METRICS', 'NCCL_SOCKET_IFNAME', 'GLOO_SOCKET_IFNAME', - 'NCCL_DEBUG', 'NCCL_IB_DISABLE', 'NCCL_NET_GDR_LEVEL', + "MASTER_ADDR", + "MASTER_PORT", + "WORLD_SIZE", + "RANK", + "NODE_RANK", + "NNODES", + "NPROC_PER_NODE", + "MAD_MULTI_NODE_RUNNER", + "MAD_COLLECT_METRICS", + "NCCL_SOCKET_IFNAME", + "GLOO_SOCKET_IFNAME", + "NCCL_DEBUG", + "NCCL_IB_DISABLE", + "NCCL_NET_GDR_LEVEL", # Primus launcher (config path and optional CLI extra args) - 'PRIMUS_CONFIG_PATH', 'PRIMUS_CLI_EXTRA', + "PRIMUS_CONFIG_PATH", + "PRIMUS_CLI_EXTRA", # Rendezvous timeout so all nodes can join after pull - 'TORCH_ELASTIC_RDZV_TIMEOUT', + "TORCH_ELASTIC_RDZV_TIMEOUT", # GPU visibility variables for Ray-based launchers (vLLM, SGLang) # CRITICAL: These must be passed to Docker for proper GPU device mapping - 'HIP_VISIBLE_DEVICES', 'ROCR_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES' + "HIP_VISIBLE_DEVICES", + "ROCR_VISIBLE_DEVICES", + "CUDA_VISIBLE_DEVICES", ] - + # Check shell environment and add to docker_env_vars merged_from_env = 0 for var_name in slurm_env_vars: if var_name in os.environ: self.context.ctx["docker_env_vars"][var_name] = os.environ[var_name] merged_from_env += 1 - + # CRITICAL FIX for rocm/vllm image: Override RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES # The rocm/vllm Docker image has RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 baked in, # which tells Ray to IGNORE 
HIP_VISIBLE_DEVICES. We must explicitly override it. # This is only needed if HIP_VISIBLE_DEVICES is set (indicating AMD GPU usage with Ray) - if 'HIP_VISIBLE_DEVICES' in self.context.ctx["docker_env_vars"]: + if "HIP_VISIBLE_DEVICES" in self.context.ctx["docker_env_vars"]: # Set to empty string to disable Ray's behavior of ignoring HIP_VISIBLE_DEVICES - self.context.ctx["docker_env_vars"]['RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES'] = '' - print("ℹ️ Overriding RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES to enable HIP_VISIBLE_DEVICES") - + self.context.ctx["docker_env_vars"][ + "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES" + ] = "" + print( + "ℹ️ Overriding RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES to enable HIP_VISIBLE_DEVICES" + ) + if merged_from_env > 0: - print(f"ℹ️ Inherited {merged_from_env} environment variables from shell for Docker") - + print( + f"ℹ️ Inherited {merged_from_env} environment variables from shell for Docker" + ) + # Also merge from additional_context if present if self.additional_context and "docker_env_vars" in self.additional_context: merged_count = 0 @@ -1007,11 +1056,14 @@ def run_container( self.context.ctx["docker_env_vars"][key] = value merged_count += 1 if merged_count > 0: - print(f"ℹ️ Merged {merged_count} environment variables from additional_context") + print( + f"ℹ️ Merged {merged_count} environment variables from additional_context" + ) - if self.context and str(self.context.ctx.get("gpu_vendor", "")).upper().find( - "AMD" - ) != -1: + if ( + self.context + and str(self.context.ctx.get("gpu_vendor", "")).upper().find("AMD") != -1 + ): from madengine.utils.rocm_path_resolver import finalize_container_rocm_path # Determine whether the user explicitly supplied ROCM_PATH for the container. @@ -1020,13 +1072,11 @@ def run_container( # If they did not, clear any ROCM_PATH left from a previous model run so # finalize always re-resolves for the current docker_image (OCI config → # in-image probe → /opt/rocm default). 
- user_supplied_rocm_path = ( - str( - (self.additional_context or {}) - .get("docker_env_vars", {}) - .get("ROCM_PATH", "") - ).strip() - ) + user_supplied_rocm_path = str( + (self.additional_context or {}) + .get("docker_env_vars", {}) + .get("ROCM_PATH", "") + ).strip() if not user_supplied_rocm_path: self.context.ctx["docker_env_vars"].pop("ROCM_PATH", None) @@ -1056,7 +1106,9 @@ def run_container( # Add system environment collection script to pre_scripts # Context can explicitly disable via gen_sys_env_details: false in additional_context ctx_sys_env = self.context.ctx.get("gen_sys_env_details") - should_collect_sys_env = ctx_sys_env if ctx_sys_env is not None else generate_sys_env_details + should_collect_sys_env = ( + ctx_sys_env if ctx_sys_env is not None else generate_sys_env_details + ) if should_collect_sys_env: self.gather_system_env_details( pre_encapsulate_post_scripts, model_info["name"] @@ -1067,19 +1119,28 @@ def run_container( resolved_gpu_count = resolve_runtime_gpus(model_info, self.additional_context) docker_options += self.get_gpu_arg(str(resolved_gpu_count)) docker_options += self.get_cpu_arg() - + # Filter out MIOPEN_USER_DB_PATH from run_env if it exists # It should be passed via docker_env_vars in context instead if "MIOPEN_USER_DB_PATH" in run_env: del run_env["MIOPEN_USER_DB_PATH"] - print("ℹ️ Removed MIOPEN_USER_DB_PATH from run_env (will use context.docker_env_vars)") - + print( + "ℹ️ Removed MIOPEN_USER_DB_PATH from run_env (will use context.docker_env_vars)" + ) + # Add MIOPEN_USER_DB_PATH from shell environment to context.docker_env_vars # This is set by SLURM script with ${LOCAL_RANK} variable for per-process paths - if "MIOPEN_USER_DB_PATH" in os.environ and "MIOPEN_USER_DB_PATH" not in self.context.ctx["docker_env_vars"]: - self.context.ctx["docker_env_vars"]["MIOPEN_USER_DB_PATH"] = os.environ["MIOPEN_USER_DB_PATH"] - print(f"ℹ️ Added MIOPEN_USER_DB_PATH to docker_env_vars: {os.environ['MIOPEN_USER_DB_PATH']}") - + if ( + 
"MIOPEN_USER_DB_PATH" in os.environ + and "MIOPEN_USER_DB_PATH" not in self.context.ctx["docker_env_vars"] + ): + self.context.ctx["docker_env_vars"]["MIOPEN_USER_DB_PATH"] = os.environ[ + "MIOPEN_USER_DB_PATH" + ] + print( + f"ℹ️ Added MIOPEN_USER_DB_PATH to docker_env_vars: {os.environ['MIOPEN_USER_DB_PATH']}" + ) + docker_options += self.get_env_arg(run_env) docker_options += self.get_mount_arg(mount_datapaths) docker_options += f" {model_info.get('additional_docker_run_options', '')}" @@ -1088,7 +1149,7 @@ def run_container( base_container_name = "container_" + re.sub( ".*:", "", docker_image.replace("/", "_").replace(":", "_") ) - + # For multi-node SLURM jobs, add node rank to avoid name conflicts node_rank = os.environ.get("SLURM_PROCID") or os.environ.get("RANK") if node_rank is not None: @@ -1098,7 +1159,9 @@ def run_container( print(f"Docker options: {docker_options}") - self.rich_console.print(f"\n[bold blue]🏃 Starting Docker container execution...[/bold blue]") + self.rich_console.print( + f"\n[bold blue]🏃 Starting Docker container execution...[/bold blue]" + ) print(f"🏷️ Image: {docker_image}") print(f"📦 Container: {container_name}") print(f"📝 Log file: {log_file_path}") @@ -1113,7 +1176,7 @@ def run_container( ), redirect_stderr(PythonicTee(outlog, self.live_output)): # set timeout (print inside log redirection so it appears in log file) print(f"⏰ Setting timeout to {str(timeout)} seconds.") - + with Timeout(timeout): model_docker = Docker( docker_image, @@ -1142,7 +1205,9 @@ def run_container( model_docker.sh("/usr/bin/nvidia-smi || true") # Print host vs container environment summary table - _print_run_env_table(gpu_vendor, self.context, model_docker, self.rich_console) + _print_run_env_table( + gpu_vendor, self.context, model_docker, self.rich_console + ) # Prepare model directory model_dir = "run_directory" @@ -1254,20 +1319,33 @@ def run_container( and self.data ): self.data.prepare_data(model_info["data"], model_docker) - + # Capture data 
provider information from selected_data_provider if ( hasattr(self.data, "selected_data_provider") and self.data.selected_data_provider ): if "dataname" in self.data.selected_data_provider: - run_results["dataname"] = self.data.selected_data_provider["dataname"] - if "data_provider_type" in self.data.selected_data_provider: - run_results["data_provider_type"] = self.data.selected_data_provider["data_provider_type"] + run_results["dataname"] = ( + self.data.selected_data_provider["dataname"] + ) + if ( + "data_provider_type" + in self.data.selected_data_provider + ): + run_results["data_provider_type"] = ( + self.data.selected_data_provider[ + "data_provider_type" + ] + ) if "duration" in self.data.selected_data_provider: - run_results["data_download_duration"] = self.data.selected_data_provider["duration"] + run_results["data_download_duration"] = ( + self.data.selected_data_provider["duration"] + ) if "size" in self.data.selected_data_provider: - run_results["data_size"] = self.data.selected_data_provider["size"] + run_results["data_size"] = ( + self.data.selected_data_provider["size"] + ) print( f"Data Provider Details: {run_results.get('dataname', '')}, " f"{run_results.get('data_provider_type', '')}, " @@ -1280,7 +1358,9 @@ def run_container( # Run the model test_start_time = time.time() - self.rich_console.print("[bold blue]Running model...[/bold blue]") + self.rich_console.print( + "[bold blue]Running model...[/bold blue]" + ) model_args = self.context.ctx.get( "model_args", model_info["args"] @@ -1310,7 +1390,9 @@ def run_container( # When model writes performance to a file in run_directory, copy to cwd # so the host can read it (e.g. bind-mounted workspace) before extraction. 
- multiple_results_file = (model_info.get("multiple_results") or "").strip() + multiple_results_file = ( + model_info.get("multiple_results") or "" + ).strip() if multiple_results_file: try: model_docker.sh( @@ -1344,27 +1426,39 @@ def run_container( # Validate multiple results file format using proper CSV parsing try: import csv + with open(resolved_path, "r") as f: csv_reader = csv.DictReader(f) # Strip whitespace from fieldnames to handle headers like "model, performance, metric" - csv_reader.fieldnames = [f.strip() for f in csv_reader.fieldnames] + csv_reader.fieldnames = [ + f.strip() for f in csv_reader.fieldnames + ] # Check if 'performance' column exists - if 'performance' not in csv_reader.fieldnames: - print("Error: 'performance' column not found in multiple results file.") + if ( + "performance" + not in csv_reader.fieldnames + ): + print( + "Error: 'performance' column not found in multiple results file." + ) run_results["performance"] = None else: # Check if at least one row has a non-empty performance value has_valid_perf = False for row in csv_reader: - if row.get('performance', '').strip(): + if row.get( + "performance", "" + ).strip(): has_valid_perf = True break - + if not has_valid_perf: run_results["performance"] = None - print("Error: Performance metric is empty in all rows of multiple results file.") + print( + "Error: Performance metric is empty in all rows of multiple results file." 
+ ) except Exception as e: self.rich_console.print( f"[yellow]Warning: Could not validate multiple results file: {e}[/yellow]" @@ -1377,46 +1471,73 @@ def run_container( # Extract from log file try: # Note: re and os are already imported at module level (lines 10, 15) - + # Verify log file exists and is readable if not os.path.exists(log_file_path): - print(f"Warning: Log file not found: {log_file_path}") + print( + f"Warning: Log file not found: {log_file_path}" + ) run_results["performance"] = None run_results["metric"] = None else: # Read the log file once (avoids rocprofv3 crash from shell pipelines) # This approach matches the Kubernetes implementation pattern - with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as f: + with open( + log_file_path, + "r", + encoding="utf-8", + errors="ignore", + ) as f: log_content = f.read() - + # Try multiple patterns to match different log formats - + # Pattern 1: "performance: [][,] " # See PERFORMANCE_LOG_PATTERN in deployment.base for accepted formats. 
- match = re.search(PERFORMANCE_LOG_PATTERN, log_content) - + match = re.search( + PERFORMANCE_LOG_PATTERN, log_content + ) + if match: - run_results["performance"] = match.group(1).strip() - run_results["metric"] = match.group(2).strip() - print(f"✓ Extracted performance: {run_results['performance']} {run_results['metric']}") + run_results["performance"] = match.group( + 1 + ).strip() + run_results["metric"] = match.group( + 2 + ).strip() + print( + f"✓ Extracted performance: {run_results['performance']} {run_results['metric']}" + ) else: # Pattern 2: HuggingFace format - "'train_samples_per_second': 4.23" or "train_samples_per_second = 4.23" # This matches the actual output from HuggingFace Trainer hf_pattern = r'train_samples_per_second[\'"\s:=]+([0-9][0-9.eE+-]*)' - hf_match = re.search(hf_pattern, log_content) - + hf_match = re.search( + hf_pattern, log_content + ) + if hf_match: - run_results["performance"] = hf_match.group(1).strip() - run_results["metric"] = "samples_per_second" - print(f"✓ Extracted performance (HuggingFace format): {run_results['performance']} {run_results['metric']}") + run_results["performance"] = ( + hf_match.group(1).strip() + ) + run_results["metric"] = ( + "samples_per_second" + ) + print( + f"✓ Extracted performance (HuggingFace format): {run_results['performance']} {run_results['metric']}" + ) else: # No performance metrics found - print("Warning: Performance metric not found in expected format 'performance: NUMBER METRIC' or 'train_samples_per_second'") + print( + "Warning: Performance metric not found in expected format 'performance: NUMBER METRIC' or 'train_samples_per_second'" + ) run_results["performance"] = None run_results["metric"] = None - + except Exception as e: - print(f"Warning: Error extracting performance metrics: {e}") + print( + f"Warning: Error extracting performance metrics: {e}" + ) run_results["performance"] = None run_results["metric"] = None # Performance extraction is optional - don't fail the entire run @@ 
-1500,9 +1621,12 @@ def run_container( and performance_value.strip() and performance_value.strip() != "N/A" ) - + # Check if this is a worker node (not collecting metrics) - is_worker_node = os.environ.get("MAD_COLLECT_METRICS", "true").lower() == "false" + is_worker_node = ( + os.environ.get("MAD_COLLECT_METRICS", "true").lower() + == "false" + ) if has_errors: run_results["status"] = "FAILURE" @@ -1522,13 +1646,20 @@ def run_container( ) else: run_results["status"] = "FAILURE" - self.rich_console.print(f"[red]Status: FAILURE (no performance metrics)[/red]") + self.rich_console.print( + f"[red]Status: FAILURE (no performance metrics)[/red]" + ) except Exception as e: - self.rich_console.print(f"[yellow]Warning: Error in status determination: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Error in status determination: {e}[/yellow]" + ) # Fallback to simple performance check # Worker nodes don't need performance metrics - is_worker_node = os.environ.get("MAD_COLLECT_METRICS", "true").lower() == "false" + is_worker_node = ( + os.environ.get("MAD_COLLECT_METRICS", "true").lower() + == "false" + ) run_results["status"] = ( "SUCCESS" if run_results.get("performance") or is_worker_node @@ -1544,8 +1675,10 @@ def run_container( # ============================================================================= # For distributed training, only master node should collect metrics # Check skip_perf_collection flag from additional_context - skip_perf = self.additional_context.get("skip_perf_collection", False) - + skip_perf = self.additional_context.get( + "skip_perf_collection", False + ) + if skip_perf: self.rich_console.print( "[cyan]ℹ️ Worker node: Skipping performance metric collection " @@ -1561,9 +1694,13 @@ def run_container( ) # Handle multiple results if specified - multiple_results = model_info.get("multiple_results", None) + multiple_results = model_info.get( + "multiple_results", None + ) resolved_multiple_results = ( - 
_resolve_multiple_results_path(multiple_results, model_dir) + _resolve_multiple_results_path( + multiple_results, model_dir + ) if multiple_results else None ) @@ -1574,7 +1711,12 @@ def run_container( # Generate common info JSON for multiple results common_info = run_details_dict.copy() # Remove model-specific fields for common info - for key in ["model", "performance", "metric", "status"]: + for key in [ + "model", + "performance", + "metric", + "status", + ]: common_info.pop(key, None) with open("common_info.json", "w") as f: @@ -1594,8 +1736,10 @@ def run_container( # Update perf_super.json with multiple results try: scripts_path = model_info.get("scripts", "") - scripts_base_dir = scripts_base_dir_from(scripts_path) - + scripts_base_dir = scripts_base_dir_from( + scripts_path + ) + # Reuse common_info.json for super files (no need for duplicate) num_entries = update_perf_super_json( multiple_results=resolved_multiple_results, @@ -1604,15 +1748,17 @@ def run_container( common_info="common_info.json", scripts_base_dir=scripts_base_dir, ) - + # Generate CSV and JSON files from perf_super.json update_perf_super_csv( perf_super_json="perf_super.json", perf_super_csv="perf_super.csv", - num_entries=num_entries + num_entries=num_entries, ) except Exception as e: - print(f"⚠️ Warning: Could not update perf_super files: {e}") + print( + f"⚠️ Warning: Could not update perf_super files: {e}" + ) else: # Generate single result JSON with open("perf_entry.json", "w") as f: @@ -1636,8 +1782,10 @@ def run_container( # Update perf_super.json with single result try: scripts_path = model_info.get("scripts", "") - scripts_base_dir = scripts_base_dir_from(scripts_path) - + scripts_base_dir = scripts_base_dir_from( + scripts_path + ) + # Use perf_entry.json as input (already created above) if run_results.get("status") == "SUCCESS": num_entries = update_perf_super_json( @@ -1651,18 +1799,22 @@ def run_container( perf_super_json="perf_super.json", scripts_base_dir=scripts_base_dir, ) 
- + # Generate CSV and JSON files from perf_super.json update_perf_super_csv( perf_super_json="perf_super.json", perf_super_csv="perf_super.csv", - num_entries=num_entries + num_entries=num_entries, ) except Exception as e: - print(f"⚠️ Warning: Could not update perf_super files: {e}") + print( + f"⚠️ Warning: Could not update perf_super files: {e}" + ) except Exception as e: - self.rich_console.print(f"[yellow]Warning: Could not update perf.csv: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Could not update perf.csv: {e}[/yellow]" + ) # Copy profiler/trace output files from run_directory to base directory before cleanup # This ensures test files like gpu_info_power_profiler_output.csv and library_trace.csv are accessible @@ -1681,7 +1833,9 @@ def run_container( f"{_bash_quote_path('.')} 2>/dev/null || true" ) model_docker.sh( - _cp_model_dir_file_to_cwd_cmd(model_dir, "library_trace.csv") + _cp_model_dir_file_to_cwd_cmd( + model_dir, "library_trace.csv" + ) ) except Exception as e: # Ignore errors if no profiler/trace output files exist @@ -1745,25 +1899,27 @@ def run_container( try: scripts_path = model_info.get("scripts", "") scripts_base_dir = scripts_base_dir_from(scripts_path) - + # Use perf_entry.json as input (already created above) num_entries = update_perf_super_json( exception_result="perf_entry.json", perf_super_json="perf_super.json", scripts_base_dir=scripts_base_dir, ) - + # Generate CSV and JSON files from perf_super.json update_perf_super_csv( perf_super_json="perf_super.json", perf_super_csv="perf_super.csv", - num_entries=num_entries + num_entries=num_entries, ) except Exception as e: print(f"⚠️ Warning: Could not update perf_super files: {e}") except Exception as csv_e: - self.rich_console.print(f"[yellow]Warning: Could not update perf.csv with exception: {csv_e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Could not update perf.csv with exception: {csv_e}[/yellow]" + ) return run_results @@ -1799,67 +1955,94 @@ 
def run_models_from_manifest( Returns: dict: Execution summary with successful and failed runs """ - self.rich_console.print(f"[bold blue]📦 Loading manifest:[/bold blue] {manifest_file}") - + self.rich_console.print( + f"[bold blue]📦 Loading manifest:[/bold blue] {manifest_file}" + ) + # Load manifest manifest = self.load_build_manifest(manifest_file) built_images = manifest.get("built_images", {}) built_models = manifest.get("built_models", {}) - + # Load deployment_config from manifest for GPU resolution if "deployment_config" in manifest and not self.additional_context: - self.additional_context = {"deployment_config": manifest["deployment_config"]} + self.additional_context = { + "deployment_config": manifest["deployment_config"] + } # Merge manifest context (e.g. skip_perf_collection for multi-node SLURM aggregation) if "context" in manifest and isinstance(manifest["context"], dict): - self.additional_context = {**(self.additional_context or {}), **manifest["context"]} + self.additional_context = { + **(self.additional_context or {}), + **manifest["context"], + } if not built_images: self.rich_console.print("[yellow]⚠️ No images found in manifest[/yellow]") return {"successful_runs": [], "failed_runs": []} - - self.rich_console.print(f"[green]Found {len(built_images)} image(s) to run[/green]\n") - + + self.rich_console.print( + f"[green]Found {len(built_images)} image(s) to run[/green]\n" + ) + # Login to registry if needed if registry or any(img.get("registry") for img in built_images.values()): effective_registry = registry or next( - (img.get("registry") for img in built_images.values() if img.get("registry")), - None + ( + img.get("registry") + for img in built_images.values() + if img.get("registry") + ), + None, ) if effective_registry: try: self.login_to_registry(effective_registry, self.credentials) except Exception as e: - self.rich_console.print(f"[yellow]Warning: Registry login failed: {e}[/yellow]") - self.rich_console.print("[yellow]Proceeding 
with local images only[/yellow]\n") - + self.rich_console.print( + f"[yellow]Warning: Registry login failed: {e}[/yellow]" + ) + self.rich_console.print( + "[yellow]Proceeding with local images only[/yellow]\n" + ) + # Track results successful_runs = [] failed_runs = [] - + # Run each model for image_name, build_info in built_images.items(): model_info = built_models.get(image_name, {}) if not model_info: - self.rich_console.print(f"[yellow]⚠️ No model info for {image_name}, skipping[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ No model info for {image_name}, skipping[/yellow]" + ) continue - + try: # Handle different image sources if build_info.get("local_image"): # Local image mode (MAD_CONTAINER_IMAGE): Use the provided image directly run_image = build_info.get("docker_image") - self.rich_console.print(f"[yellow]🏠 Using local image: {run_image}[/yellow]") - + self.rich_console.print( + f"[yellow]🏠 Using local image: {run_image}[/yellow]" + ) + # Verify image exists try: - self.console.sh(f"docker image inspect {run_image} > /dev/null 2>&1") + self.console.sh( + f"docker image inspect {run_image} > /dev/null 2>&1" + ) except (subprocess.CalledProcessError, RuntimeError) as e: - self.rich_console.print(f"[yellow]⚠️ Image {run_image} not found, attempting to pull...[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ Image {run_image} not found, attempting to pull...[/yellow]" + ) try: self.pull_image(run_image) except Exception as e: - raise RuntimeError(f"Failed to find or pull local image {run_image}: {e}") - + raise RuntimeError( + f"Failed to find or pull local image {run_image}: {e}" + ) + elif build_info.get("registry_image"): # Registry image: Pull from registry try: @@ -1867,12 +2050,14 @@ def run_models_from_manifest( # Update docker_image to use registry image run_image = build_info["registry_image"] except Exception as pull_error: - self.rich_console.print(f"[yellow]Warning: Could not pull from registry, using local image[/yellow]") + 
self.rich_console.print( + f"[yellow]Warning: Could not pull from registry, using local image[/yellow]" + ) run_image = image_name else: # Normal built image: Use the image name directly run_image = image_name - + # Run the container run_results = self.run_container( model_info=model_info, @@ -1883,38 +2068,49 @@ def run_models_from_manifest( timeout=timeout, phase_suffix=phase_suffix, ) - + # Check actual status and track accordingly status = run_results.get("status", "SUCCESS") if status == "SUCCESS": - successful_runs.append({ - "model": model_info["name"], - "image": run_image, - "status": status, - "performance": run_results.get("performance"), - "duration": run_results.get("test_duration"), - }) + successful_runs.append( + { + "model": model_info["name"], + "image": run_image, + "status": status, + "performance": run_results.get("performance"), + "duration": run_results.get("test_duration"), + } + ) else: # Status is FAILURE - track as failed - failed_runs.append({ - "model": model_info["name"], - "image": run_image, - "status": status, - "error": "Container execution failed - check logs for details", - }) - self.rich_console.print(f"[red]❌ Run failed for {model_info['name']}: Status={status}[/red]") - + failed_runs.append( + { + "model": model_info["name"], + "image": run_image, + "status": status, + "error": "Container execution failed - check logs for details", + } + ) + self.rich_console.print( + f"[red]❌ Run failed for {model_info['name']}: Status={status}[/red]" + ) + except Exception as e: - self.rich_console.print(f"[red]❌ Failed to run {model_info['name']}: {e}[/red]") + self.rich_console.print( + f"[red]❌ Failed to run {model_info['name']}: {e}[/red]" + ) error_msg = str(e) - failed_runs.append({ - "model": model_info.get("name", image_name), - "image": image_name, - "error": error_msg, - }) + failed_runs.append( + { + "model": model_info.get("name", image_name), + "image": image_name, + "error": error_msg, + } + ) # Record failure in performance 
table so status is consistent and table is complete try: import tempfile + self.ensure_perf_csv_exists() perf_entry = self._create_setup_failure_perf_entry( model_info=model_info, @@ -1941,12 +2137,14 @@ def run_models_from_manifest( self.rich_console.print( f"[yellow]Warning: Could not record setup failure to perf CSV: {csv_e}[/yellow]" ) - + # Summary self.rich_console.print(f"\n[bold]📊 Execution Summary:[/bold]") - self.rich_console.print(f" [green]✓ Successful:[/green] {len(successful_runs)}") + self.rich_console.print( + f" [green]✓ Successful:[/green] {len(successful_runs)}" + ) self.rich_console.print(f" [red]✗ Failed:[/red] {len(failed_runs)}") - + return { "successful_runs": successful_runs, "failed_runs": failed_runs, diff --git a/src/madengine/execution/container_runner_helpers.py b/src/madengine/execution/container_runner_helpers.py index dfa99be5..63325e7a 100644 --- a/src/madengine/execution/container_runner_helpers.py +++ b/src/madengine/execution/container_runner_helpers.py @@ -205,19 +205,13 @@ def _docker_image_ref_for_log_naming(docker_image: str) -> str: ref_without_digest = s.split("@", 1)[0] last_slash = ref_without_digest.rfind("/") tail = ( - ref_without_digest[last_slash + 1 :] - if last_slash >= 0 - else ref_without_digest + ref_without_digest[last_slash + 1 :] if last_slash >= 0 else ref_without_digest ) if ":" in tail: _, tag = tail.split(":", 1) if tag.startswith("ci-"): return tag - return ( - ref_without_digest.replace("/", "_") - .replace(":", "_") - .replace("@", "_") - ) + return ref_without_digest.replace("/", "_").replace(":", "_").replace("@", "_") def make_run_log_file_path( diff --git a/src/madengine/execution/docker_builder.py b/src/madengine/execution/docker_builder.py index 56f33d6d..b7130def 100644 --- a/src/madengine/execution/docker_builder.py +++ b/src/madengine/execution/docker_builder.py @@ -7,22 +7,24 @@ and then distributed to remote nodes for execution. 
""" +import json import os +import re import shlex import time -import json -import re import typing -from contextlib import redirect_stdout, redirect_stderr +from contextlib import redirect_stderr, redirect_stdout + from rich.console import Console as RichConsole + from madengine.core.auth import login_to_registry from madengine.core.console import Console from madengine.core.context import Context -from madengine.utils.ops import PythonicTee from madengine.execution.dockerfile_utils import ( is_target_arch_compatible_with_variable, parse_dockerfile_gpu_variables, ) +from madengine.utils.ops import PythonicTee class DockerBuilder: @@ -93,7 +95,13 @@ def get_build_arg(self, run_build_arg: typing.Optional[typing.Dict] = None) -> s if run_build_arg: for key, value in run_build_arg.items(): - build_args += "--build-arg " + shlex.quote(str(key)) + "=" + shlex.quote(str(value)) + " " + build_args += ( + "--build-arg " + + shlex.quote(str(key)) + + "=" + + shlex.quote(str(value)) + + " " + ) return build_args @@ -146,7 +154,9 @@ def build_image( # Replace / with _ in log file path (already done above, but keeping for safety) log_file_path = log_file_path.replace("/", "_") - self.rich_console.print(f"\n[bold green]🔨 Starting Docker build for model:[/bold green] [bold cyan]{model_info['name']}[/bold cyan]") + self.rich_console.print( + f"\n[bold green]🔨 Starting Docker build for model:[/bold green] [bold cyan]{model_info['name']}[/bold cyan]" + ) print(f"📁 Dockerfile: {dockerfile}") print(f"🏷️ Target image: {docker_image}") print(f"📝 Build log: {log_file_path}") @@ -195,7 +205,9 @@ def build_image( print(f"⏱️ Build Duration: {build_duration:.2f} seconds") print(f"🏷️ MAD_CONTAINER_IMAGE is {docker_image}") - self.rich_console.print(f"[bold green]✅ Docker build completed successfully[/bold green]") + self.rich_console.print( + f"[bold green]✅ Docker build completed successfully[/bold green]" + ) self.rich_console.print(f"[dim]{'='*80}[/dim]") # Get base docker info @@ 
-220,11 +232,13 @@ def build_image( ) print(f"BASE DOCKER SHA is {docker_sha}") except Exception as e: - self.rich_console.print(f"[yellow]Warning: Could not get docker SHA: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Could not get docker SHA: {e}[/yellow]" + ) # Infer GPU vendor from dockerfile path gpu_vendor = self._infer_gpu_vendor_from_dockerfile(dockerfile) - + build_info = { "model": model_info["name"], "docker_image": docker_image, @@ -243,7 +257,9 @@ def build_image( # Store model info linked to the built image self.built_models[docker_image] = model_info - self.rich_console.print(f"[bold green]Successfully built image:[/bold green] [cyan]{docker_image}[/cyan]") + self.rich_console.print( + f"[bold green]Successfully built image:[/bold green] [cyan]{docker_image}[/cyan]" + ) return build_info @@ -306,17 +322,23 @@ def push_image( # Push the image push_command = f"docker push {registry_image}" - self.rich_console.print(f"\n[bold blue]🚀 Starting docker push to registry...[/bold blue]") + self.rich_console.print( + f"\n[bold blue]🚀 Starting docker push to registry...[/bold blue]" + ) print(f"📤 Registry: {registry}") print(f"🏷️ Image: {registry_image}") self.console.sh(push_command) - self.rich_console.print(f"[bold green]✅ Successfully pushed image to registry:[/bold green] [cyan]{registry_image}[/cyan]") + self.rich_console.print( + f"[bold green]✅ Successfully pushed image to registry:[/bold green] [cyan]{registry_image}[/cyan]" + ) self.rich_console.print(f"[dim]{'='*80}[/dim]") return registry_image except Exception as e: - self.rich_console.print(f"[red]❌ Failed to push image {docker_image} to registry {registry}: {e}[/red]") + self.rich_console.print( + f"[red]❌ Failed to push image {docker_image} to registry {registry}: {e}[/red]" + ) raise def export_build_manifest( @@ -379,18 +401,20 @@ def export_build_manifest( }, "credentials_required": credentials_required, } - + # Preserve tools configuration if present in context if "tools" 
in self.context.ctx: manifest["context"]["tools"] = self.context.ctx["tools"] - + # Preserve pre/post scripts if present in context if "pre_scripts" in self.context.ctx: manifest["context"]["pre_scripts"] = self.context.ctx["pre_scripts"] if "post_scripts" in self.context.ctx: manifest["context"]["post_scripts"] = self.context.ctx["post_scripts"] if "encapsulate_script" in self.context.ctx: - manifest["context"]["encapsulate_script"] = self.context.ctx["encapsulate_script"] + manifest["context"]["encapsulate_script"] = self.context.ctx[ + "encapsulate_script" + ] # Add push failure summary if any pushes failed push_failures = [] @@ -410,9 +434,13 @@ def export_build_manifest( with open(output_file, "w") as f: json.dump(manifest, f, indent=2) - self.rich_console.print(f"[green]Build manifest exported to:[/green] {output_file}") + self.rich_console.print( + f"[green]Build manifest exported to:[/green] {output_file}" + ) if push_failures: - self.rich_console.print(f"[yellow]Warning: {len(push_failures)} image(s) failed to push to registry[/yellow]") + self.rich_console.print( + f"[yellow]Warning: {len(push_failures)} image(s) failed to push to registry[/yellow]" + ) for failure in push_failures: self.rich_console.print( f"[red] - {failure['image']} -> {failure['intended_registry_image']}: {failure['error']}[/red]" @@ -442,12 +470,18 @@ def build_all_models( Returns: dict: Summary of all built images """ - self.rich_console.print(f"[bold blue]Building Docker images for {len(models)} models...[/bold blue]") - + self.rich_console.print( + f"[bold blue]Building Docker images for {len(models)} models...[/bold blue]" + ) + if target_archs: - self.rich_console.print(f"[bold cyan]Multi-architecture build mode enabled for: {', '.join(target_archs)}[/bold cyan]") + self.rich_console.print( + f"[bold cyan]Multi-architecture build mode enabled for: {', '.join(target_archs)}[/bold cyan]" + ) else: - self.rich_console.print(f"[bold cyan]Single architecture build mode[/bold cyan]") 
+ self.rich_console.print( + f"[bold cyan]Single architecture build mode[/bold cyan]" + ) build_summary = { "successful_builds": [], @@ -456,69 +490,89 @@ def build_all_models( "successful_pushes": [], "failed_pushes": [], } - + for model_info in models: # Check if MAD_SYSTEM_GPU_ARCHITECTURE is provided in additional_context # This overrides --target-archs and uses default flow - if ("docker_build_arg" in self.context.ctx and - "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"]): - self.rich_console.print(f"[yellow]Info: MAD_SYSTEM_GPU_ARCHITECTURE provided in additional_context, " - f"disabling --target-archs and using default flow for model {model_info['name']}[/yellow]") + if ( + "docker_build_arg" in self.context.ctx + and "MAD_SYSTEM_GPU_ARCHITECTURE" + in self.context.ctx["docker_build_arg"] + ): + self.rich_console.print( + f"[yellow]Info: MAD_SYSTEM_GPU_ARCHITECTURE provided in additional_context, " + f"disabling --target-archs and using default flow for model {model_info['name']}[/yellow]" + ) # Use single architecture build mode regardless of target_archs try: single_build_info = self._build_model_single_arch( - model_info, credentials, clean_cache, - registry, phase_suffix, batch_build_metadata + model_info, + credentials, + clean_cache, + registry, + phase_suffix, + batch_build_metadata, ) build_summary["successful_builds"].extend(single_build_info) build_summary["total_build_time"] += sum( info.get("build_duration", 0) for info in single_build_info ) except Exception as e: - build_summary["failed_builds"].append({ - "model": model_info["name"], - "error": str(e) - }) + build_summary["failed_builds"].append( + {"model": model_info["name"], "error": str(e)} + ) elif target_archs: # Multi-architecture build mode - always use architecture suffix for arch in target_archs: try: # Always build with architecture suffix when --target-archs is used arch_build_info = self._build_model_for_arch( - model_info, arch, credentials, clean_cache, - 
registry, phase_suffix, batch_build_metadata + model_info, + arch, + credentials, + clean_cache, + registry, + phase_suffix, + batch_build_metadata, ) - + build_summary["successful_builds"].extend(arch_build_info) build_summary["total_build_time"] += sum( info.get("build_duration", 0) for info in arch_build_info ) except Exception as e: - build_summary["failed_builds"].append({ - "model": model_info["name"], - "architecture": arch, - "error": str(e) - }) + build_summary["failed_builds"].append( + { + "model": model_info["name"], + "architecture": arch, + "error": str(e), + } + ) else: # Single architecture build mode (existing behavior - no validation needed) try: single_build_info = self._build_model_single_arch( - model_info, credentials, clean_cache, - registry, phase_suffix, batch_build_metadata + model_info, + credentials, + clean_cache, + registry, + phase_suffix, + batch_build_metadata, ) build_summary["successful_builds"].extend(single_build_info) build_summary["total_build_time"] += sum( info.get("build_duration", 0) for info in single_build_info ) except Exception as e: - build_summary["failed_builds"].append({ - "model": model_info["name"], - "error": str(e) - }) - + build_summary["failed_builds"].append( + {"model": model_info["name"], "error": str(e)} + ) + return build_summary - def _check_dockerfile_has_gpu_variables(self, model_info: typing.Dict) -> typing.Tuple[bool, str]: + def _check_dockerfile_has_gpu_variables( + self, model_info: typing.Dict + ) -> typing.Tuple[bool, str]: """ Check if model's Dockerfile contains GPU architecture variables. 
Returns (has_gpu_vars, dockerfile_path) @@ -526,24 +580,26 @@ def _check_dockerfile_has_gpu_variables(self, model_info: typing.Dict) -> typing try: # Find dockerfiles for this model dockerfiles = self._get_dockerfiles_for_model(model_info) - + for dockerfile_path in dockerfiles: - with open(dockerfile_path, 'r') as f: + with open(dockerfile_path, "r") as f: dockerfile_content = f.read() - + # Parse GPU architecture variables from Dockerfile dockerfile_gpu_vars = parse_dockerfile_gpu_variables(dockerfile_content) - + if dockerfile_gpu_vars: return True, dockerfile_path else: return False, dockerfile_path - + # No dockerfiles found return False, "No Dockerfile found" - + except Exception as e: - self.rich_console.print(f"[yellow]Warning: Error checking GPU variables for model {model_info['name']}: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Error checking GPU variables for model {model_info['name']}: {e}[/yellow]" + ) return False, "Error reading Dockerfile" def _get_dockerfiles_for_model(self, model_info: typing.Dict) -> typing.List[str]: @@ -551,9 +607,7 @@ def _get_dockerfiles_for_model(self, model_info: typing.Dict) -> typing.List[str try: # Quote the dockerfile path to prevent shell injection dockerfile_quoted = shlex.quote(model_info["dockerfile"]) - all_dockerfiles = self.console.sh( - f"ls {dockerfile_quoted}.*" - ).split("\n") + all_dockerfiles = self.console.sh(f"ls {dockerfile_quoted}.*").split("\n") dockerfiles = {} for cur_docker_file in all_dockerfiles: @@ -564,14 +618,18 @@ def _get_dockerfiles_for_model(self, model_info: typing.Dict) -> typing.List[str # Filter dockerfiles based on context dockerfiles = self.context.filter(dockerfiles) - + return list(dockerfiles.keys()) - + except Exception as e: - self.rich_console.print(f"[yellow]Warning: Error finding dockerfiles for model {model_info['name']}: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Error finding dockerfiles for model {model_info['name']}: {e}[/yellow]" + 
) return [] - def _validate_target_arch_against_dockerfile(self, model_info: typing.Dict, target_arch: str) -> bool: + def _validate_target_arch_against_dockerfile( + self, model_info: typing.Dict, target_arch: str + ) -> bool: """ Validate that target architecture is compatible with model's Dockerfile GPU variables. Called during build phase when --target-archs is provided. @@ -579,71 +637,77 @@ def _validate_target_arch_against_dockerfile(self, model_info: typing.Dict, targ try: # Find dockerfiles for this model dockerfiles = self._get_dockerfiles_for_model(model_info) - + for dockerfile_path in dockerfiles: - with open(dockerfile_path, 'r') as f: + with open(dockerfile_path, "r") as f: dockerfile_content = f.read() - + # Parse GPU architecture variables from Dockerfile dockerfile_gpu_vars = parse_dockerfile_gpu_variables(dockerfile_content) - + if not dockerfile_gpu_vars: # No GPU variables found - target arch is acceptable - self.rich_console.print(f"[cyan]Info: No GPU architecture variables found in {dockerfile_path}, " - f"target architecture '{target_arch}' is acceptable[/cyan]") + self.rich_console.print( + f"[cyan]Info: No GPU architecture variables found in {dockerfile_path}, " + f"target architecture '{target_arch}' is acceptable[/cyan]" + ) continue - + # Validate target architecture against each GPU variable for var_name, var_values in dockerfile_gpu_vars.items(): if not is_target_arch_compatible_with_variable( var_name, var_values, target_arch ): - self.rich_console.print(f"[red]Error: Target architecture '{target_arch}' is not compatible " - f"with {var_name}={var_values} in {dockerfile_path}[/red]") + self.rich_console.print( + f"[red]Error: Target architecture '{target_arch}' is not compatible " + f"with {var_name}={var_values} in {dockerfile_path}[/red]" + ) return False - - self.rich_console.print(f"[cyan]Info: Target architecture '{target_arch}' validated successfully " - f"against {dockerfile_path}[/cyan]") - + + self.rich_console.print( + 
f"[cyan]Info: Target architecture '{target_arch}' validated successfully " + f"against {dockerfile_path}[/cyan]" + ) + return True - + except FileNotFoundError as e: - self.rich_console.print(f"[yellow]Warning: Dockerfile not found for model {model_info['name']}: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Dockerfile not found for model {model_info['name']}: {e}[/yellow]" + ) return True # Assume compatible if Dockerfile not found except Exception as e: - self.rich_console.print(f"[yellow]Warning: Error validating target architecture for model {model_info['name']}: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Error validating target architecture for model {model_info['name']}: {e}[/yellow]" + ) return True # Assume compatible on parsing errors def _build_model_single_arch( - self, + self, model_info: typing.Dict, credentials: typing.Dict, clean_cache: bool, registry: str, phase_suffix: str, - batch_build_metadata: typing.Optional[dict] + batch_build_metadata: typing.Optional[dict], ) -> typing.List[typing.Dict]: """Build model using existing single architecture flow.""" - + # Use existing build logic - MAD_SYSTEM_GPU_ARCHITECTURE comes from additional_context # or Dockerfile defaults dockerfiles = self._get_dockerfiles_for_model(model_info) - + results = [] for dockerfile in dockerfiles: build_info = self.build_image( - model_info, - dockerfile, - credentials, - clean_cache, - phase_suffix + model_info, dockerfile, credentials, clean_cache, phase_suffix ) - + # Extract GPU architecture from build args or context for manifest gpu_arch = self._get_effective_gpu_architecture(model_info, dockerfile) if gpu_arch: build_info["gpu_architecture"] = gpu_arch - + # Handle registry push (existing logic) if registry: try: @@ -654,73 +718,82 @@ def _build_model_single_arch( model_info, credentials, ) - self.push_image(build_info["docker_image"], registry, credentials, registry_image) + self.push_image( + build_info["docker_image"], + 
registry, + credentials, + registry_image, + ) build_info["registry_image"] = registry_image except Exception as e: build_info["push_error"] = str(e) - + results.append(build_info) - + return results - def _get_effective_gpu_architecture(self, model_info: typing.Dict, dockerfile_path: str) -> str: + def _get_effective_gpu_architecture( + self, model_info: typing.Dict, dockerfile_path: str + ) -> str: """Get effective GPU architecture for single arch builds.""" # Check if MAD_SYSTEM_GPU_ARCHITECTURE is in build args from additional_context - if ("docker_build_arg" in self.context.ctx and - "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"]): + if ( + "docker_build_arg" in self.context.ctx + and "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"] + ): return self.context.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] - + # Try to extract from Dockerfile defaults try: - with open(dockerfile_path, 'r') as f: + with open(dockerfile_path, "r") as f: content = f.read() - + # Look for ARG or ENV declarations patterns = [ r"ARG\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)", - r"ENV\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)" + r"ENV\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)", ] - + for pattern in patterns: match = re.search(pattern, content, re.IGNORECASE) if match: - return match.group(1).strip('"\'') + return match.group(1).strip("\"'") except Exception: pass - + return None def _infer_gpu_vendor_from_dockerfile(self, dockerfile: str) -> str: """Infer GPU vendor from dockerfile path. - + Args: dockerfile: Path to dockerfile (e.g., "docker/dummy.ubuntu.amd.Dockerfile") - + Returns: GPU vendor string: "AMD", "NVIDIA", or "" """ dockerfile_lower = dockerfile.lower() - + # Check for explicit vendor indicators in filename - if '.amd.' in dockerfile_lower or dockerfile_lower.endswith('.amd'): + if ".amd." in dockerfile_lower or dockerfile_lower.endswith(".amd"): return "AMD" - elif '.nvidia.' 
in dockerfile_lower or dockerfile_lower.endswith('.nvidia'): + elif ".nvidia." in dockerfile_lower or dockerfile_lower.endswith(".nvidia"): return "NVIDIA" - + # Try to infer from base image in Dockerfile try: - with open(dockerfile, 'r') as f: + with open(dockerfile, "r") as f: content = f.read() - + # Look for base image indicators - if 'FROM' in content: - if 'rocm' in content.lower() or 'amd' in content.lower(): + if "FROM" in content: + if "rocm" in content.lower() or "amd" in content.lower(): return "AMD" - elif 'nvidia' in content.lower() or 'cuda' in content.lower(): + elif "nvidia" in content.lower() or "cuda" in content.lower(): return "NVIDIA" except Exception: pass - + # Default to empty (legacy - will be treated as compatible with all) return "" @@ -728,13 +801,15 @@ def _create_base_image_name(self, model_info: typing.Dict, dockerfile: str) -> s """Create base image name from model info and dockerfile.""" # Extract dockerfile context suffix (e.g., "ubuntu.amd" from "dummy.ubuntu.amd.Dockerfile") dockerfile_name = os.path.basename(dockerfile) - if '.' in dockerfile_name: + if "." 
in dockerfile_name: # Remove the .Dockerfile extension and get context - context_parts = dockerfile_name.replace('.Dockerfile', '').split('.')[1:] # Skip model name - context_suffix = '.'.join(context_parts) if context_parts else 'default' + context_parts = dockerfile_name.replace(".Dockerfile", "").split(".")[ + 1: + ] # Skip model name + context_suffix = ".".join(context_parts) if context_parts else "default" else: - context_suffix = 'default' - + context_suffix = "default" + # Create base image name: ci-{model}_{model}.{context} return f"ci-{model_info['name']}_{model_info['name']}.{context_suffix}" @@ -822,55 +897,57 @@ def _determine_registry_image_name( return registry_image def _build_model_for_arch( - self, + self, model_info: typing.Dict, gpu_arch: str, credentials: typing.Dict, clean_cache: bool, registry: str, phase_suffix: str, - batch_build_metadata: typing.Optional[dict] + batch_build_metadata: typing.Optional[dict], ) -> typing.List[typing.Dict]: """Build model for specific GPU architecture with smart image naming.""" - + # Find dockerfiles dockerfiles = self._get_dockerfiles_for_model(model_info) - + arch_results = [] for dockerfile in dockerfiles: # When using --target-archs, always add architecture suffix regardless of GPU variables # This ensures consistent naming for multi-architecture builds base_image_name = self._create_base_image_name(model_info, dockerfile) arch_image_name = f"{base_image_name}_{gpu_arch}" - + # Set MAD_SYSTEM_GPU_ARCHITECTURE for this build arch_build_args = {"MAD_SYSTEM_GPU_ARCHITECTURE": gpu_arch} - + # Build the image build_info = self.build_image( - model_info, - dockerfile, + model_info, + dockerfile, credentials, - clean_cache, + clean_cache, phase_suffix, additional_build_args=arch_build_args, - override_image_name=arch_image_name + override_image_name=arch_image_name, ) - + # Add architecture metadata build_info["gpu_architecture"] = gpu_arch - + # Handle registry push with architecture-specific tagging if 
registry: registry_image = self._determine_registry_image_name( arch_image_name, registry, credentials ) try: - self.push_image(arch_image_name, registry, credentials, registry_image) + self.push_image( + arch_image_name, registry, credentials, registry_image + ) build_info["registry_image"] = registry_image except Exception as e: build_info["push_error"] = str(e) - + arch_results.append(build_info) - + return arch_results diff --git a/src/madengine/execution/dockerfile_utils.py b/src/madengine/execution/dockerfile_utils.py index c8392b5a..ee663d8f 100644 --- a/src/madengine/execution/dockerfile_utils.py +++ b/src/madengine/execution/dockerfile_utils.py @@ -32,7 +32,7 @@ def parse_dockerfile_gpu_variables( all_matches = arg_matches + env_matches if all_matches: - raw_value = all_matches[-1].strip('"\'') + raw_value = all_matches[-1].strip("\"'") parsed_values = parse_gpu_variable_value(var_name, raw_value) if parsed_values: gpu_variables[var_name] = parsed_values diff --git a/src/madengine/orchestration/__init__.py b/src/madengine/orchestration/__init__.py index e3dce29a..93c53110 100644 --- a/src/madengine/orchestration/__init__.py +++ b/src/madengine/orchestration/__init__.py @@ -13,4 +13,3 @@ from .run_orchestrator import RunOrchestrator __all__ = ["BuildOrchestrator", "RunOrchestrator"] - diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index d905f3b4..d78e08e8 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -16,21 +16,21 @@ from rich.console import Console as RichConsole from rich.panel import Panel -from madengine.core.console import Console -from madengine.core.context import Context from madengine.core.additional_context_defaults import apply_build_context_defaults from madengine.core.auth import load_credentials +from madengine.core.console import Console +from madengine.core.context import Context from 
madengine.core.errors import ( BuildError, ConfigurationError, DiscoveryError, create_error_context, ) -from madengine.utils.discover_models import DiscoverModels from madengine.execution.docker_builder import DockerBuilder from madengine.execution.dockerfile_utils import ( dockerfile_requires_explicit_mad_arch_build_arg, ) +from madengine.utils.discover_models import DiscoverModels class BuildOrchestrator: @@ -45,7 +45,12 @@ class BuildOrchestrator: - Save deployment_config from --additional-context """ - def __init__(self, args, additional_context: Optional[Dict] = None, detect_local_gpu_arch: bool = False): + def __init__( + self, + args, + additional_context: Optional[Dict] = None, + detect_local_gpu_arch: bool = False, + ): """ Initialize build orchestrator. @@ -63,7 +68,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None, detect_local # Merge additional_context from args and parameter merged_context = {} - + # Load from file first if provided if hasattr(args, "additional_context_file") and args.additional_context_file: try: @@ -71,7 +76,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None, detect_local merged_context = json.load(f) except (FileNotFoundError, json.JSONDecodeError) as e: print(f"Warning: Could not load additional_context_file: {e}") - + # Then merge string additional_context (overrides file) if hasattr(args, "additional_context") and args.additional_context: try: @@ -79,6 +84,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None, detect_local # Use ast.literal_eval for Python dict syntax (single quotes) # This matches what Context class expects import ast + context_from_string = ast.literal_eval(args.additional_context) merged_context.update(context_from_string) elif isinstance(args.additional_context, dict): @@ -95,31 +101,42 @@ def __init__(self, args, additional_context: Optional[Dict] = None, detect_local apply_build_context_defaults(merged_context) self.additional_context = 
merged_context - + # Apply ConfigLoader to infer deploy type, validate, and apply defaults if self.additional_context: try: from madengine.deployment.config_loader import ConfigLoader + # This will: # 1. Infer deploy type from k8s/slurm presence # 2. Validate for conflicts (e.g., both k8s and slurm) # 3. Apply appropriate defaults # 4. Add 'deploy' field for internal use - self.additional_context = ConfigLoader.load_config(self.additional_context) + self.additional_context = ConfigLoader.load_config( + self.additional_context + ) except ValueError as e: # Re-raise as ConfigurationError so the CLI layer handles the exit code raise ConfigurationError(str(e)) except Exception as e: # Other errors during config loading - warn but continue - self.rich_console.print(f"[yellow]Warning: Could not apply config defaults: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Could not apply config defaults: {e}[/yellow]" + ) self.rich_console.print("[bold blue]Build additional context[/bold blue]\n") - self.rich_console.print(Panel( - json.dumps(self.additional_context, indent=2) if self.additional_context else "(empty)", - title="[bold]Context[/bold] (from --additional-context / --additional-context-file)", - border_style="dim", - padding=(0, 1), - )) + self.rich_console.print( + Panel( + ( + json.dumps(self.additional_context, indent=2) + if self.additional_context + else "(empty)" + ), + title="[bold]Context[/bold] (from --additional-context / --additional-context-file)", + border_style="dim", + padding=(0, 1), + ) + ) self.rich_console.print() # Initialize context in build-only mode (no GPU detection by default). @@ -140,7 +157,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None, detect_local def _copy_scripts(self): """[DEPRECATED] Copy common scripts to model directories. - + This method is no longer called during build phase as it's not needed. Build phase only creates Docker images - script execution happens in run phase. 
Scripts are copied by run_orchestrator._copy_scripts() for local execution. @@ -247,14 +264,18 @@ def execute( ) self._warn_if_mad_arch_unresolved_for_dockerfiles(models, builder) - resolved_arch = self.context.ctx.get("docker_build_arg", {}).get("MAD_SYSTEM_GPU_ARCHITECTURE") + resolved_arch = self.context.ctx.get("docker_build_arg", {}).get( + "MAD_SYSTEM_GPU_ARCHITECTURE" + ) if resolved_arch: self.rich_console.print( f"[green]✓ MAD_SYSTEM_GPU_ARCHITECTURE resolved: {resolved_arch}[/green]\n" ) # Step 3: Build Docker images - self.rich_console.print("[bold cyan]🏗️ Building Docker images...[/bold cyan]") + self.rich_console.print( + "[bold cyan]🏗️ Building Docker images...[/bold cyan]" + ) # Determine phase suffix for log files # Build phase always uses .build suffix to avoid conflicts with run logs @@ -302,8 +323,12 @@ def execute( self.rich_console.print(f" [red]• {model_name}: {error_msg}[/red]") # Step 4: ALWAYS generate manifest (even with partial failures) - self.rich_console.print("\n[bold cyan]📄 Generating build manifest...[/bold cyan]") - builder.export_build_manifest(manifest_output, registry, batch_build_metadata) + self.rich_console.print( + "\n[bold cyan]📄 Generating build manifest...[/bold cyan]" + ) + builder.export_build_manifest( + manifest_output, registry, batch_build_metadata + ) # Step 5: Save build summary to manifest self._save_build_summary(manifest_output, build_summary) @@ -311,7 +336,9 @@ def execute( # Step 6: Save deployment_config to manifest self._save_deployment_config(manifest_output) - self.rich_console.print(f"[green]✓ Build complete: {manifest_output}[/green]") + self.rich_console.print( + f"[green]✓ Build complete: {manifest_output}[/green]" + ) self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") # Step 7: Check if we should fail (only if ALL builds failed) @@ -369,12 +396,16 @@ def _save_build_summary(self, manifest_file: str, build_summary: Dict): json.dump(manifest, f, indent=2) except Exception as e: - 
self.rich_console.print(f"[yellow]Warning: Could not save build summary: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Could not save build summary: {e}[/yellow]" + ) def _save_deployment_config(self, manifest_file: str): """Save deployment_config from --additional-context to manifest.""" if not self.additional_context: - self.rich_console.print("[dim]No additional_context provided, skipping deployment config[/dim]") + self.rich_console.print( + "[dim]No additional_context provided, skipping deployment config[/dim]" + ) return try: @@ -388,18 +419,22 @@ def _save_deployment_config(self, manifest_file: str): # Auto-detect based on config presence if self.additional_context.get("slurm"): target = "slurm" - elif self.additional_context.get("k8s") or self.additional_context.get("kubernetes"): + elif self.additional_context.get("k8s") or self.additional_context.get( + "kubernetes" + ): target = "k8s" else: target = "local" - + # Get env_vars and filter out MIOPEN_USER_DB_PATH # This variable must be set per-process in multi-GPU training to avoid database conflicts env_vars = self.additional_context.get("env_vars", {}).copy() if "MIOPEN_USER_DB_PATH" in env_vars: del env_vars["MIOPEN_USER_DB_PATH"] - print("ℹ️ Filtered MIOPEN_USER_DB_PATH from env_vars (will be set per-process in training)") - + print( + "ℹ️ Filtered MIOPEN_USER_DB_PATH from env_vars (will be set per-process in training)" + ) + deployment_config = { "target": target, "slurm": self.additional_context.get("slurm"), @@ -416,17 +451,25 @@ def _save_deployment_config(self, manifest_file: str): k: v for k, v in deployment_config.items() if v is not None } - if deployment_config and deployment_config != {"target": "local", "env_vars": {}}: + if deployment_config and deployment_config != { + "target": "local", + "env_vars": {}, + }: manifest["deployment_config"] = deployment_config with open(manifest_file, "w") as f: json.dump(manifest, f, indent=2) - self.rich_console.print(f"[green]✓ Saved 
deployment config to {manifest_file}[/green]") + self.rich_console.print( + f"[green]✓ Saved deployment config to {manifest_file}[/green]" + ) else: - self.rich_console.print("[dim]No deployment config to save (local execution)[/dim]") + self.rich_console.print( + "[dim]No deployment config to save (local execution)[/dim]" + ) except Exception as e: # Non-fatal - just warn - self.rich_console.print(f"[yellow]Warning: Could not save deployment config: {e}[/yellow]") - + self.rich_console.print( + f"[yellow]Warning: Could not save deployment config: {e}[/yellow]" + ) diff --git a/src/madengine/orchestration/image_filtering.py b/src/madengine/orchestration/image_filtering.py index 88a8725e..5026f35e 100644 --- a/src/madengine/orchestration/image_filtering.py +++ b/src/madengine/orchestration/image_filtering.py @@ -43,13 +43,19 @@ def filter_images_by_gpu_compatibility( compatible[model_name] = image_info else: skipped.append( - (model_name, f"architecture mismatch ({image_arch} != {runtime_gpu_arch})") + ( + model_name, + f"architecture mismatch ({image_arch} != {runtime_gpu_arch})", + ) ) else: compatible[model_name] = image_info else: skipped.append( - (model_name, f"GPU vendor mismatch ({image_gpu_vendor} != {runtime_gpu_vendor})") + ( + model_name, + f"GPU vendor mismatch ({image_gpu_vendor} != {runtime_gpu_vendor})", + ) ) return compatible, skipped diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index 6742b2a5..bdd7b13d 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -68,33 +68,44 @@ def __init__(self, args, additional_context: Optional[Dict] = None): # Use ast.literal_eval for Python dict syntax (single quotes) # This matches what Context class expects import ast + parsed = ast.literal_eval(args.additional_context) merged_context = parsed if isinstance(parsed, dict) else {} elif isinstance(args.additional_context, dict): 
merged_context = args.additional_context except (ValueError, SyntaxError) as e: - self.rich_console.print(f"[yellow]Warning: Could not parse additional_context: {e}[/yellow]") + self.rich_console.print( + f"[yellow]Warning: Could not parse additional_context: {e}[/yellow]" + ) if args.additional_context: - self.rich_console.print(f"[dim]Raw (first 200 chars): {str(args.additional_context)[:200]}[/dim]") + self.rich_console.print( + f"[dim]Raw (first 200 chars): {str(args.additional_context)[:200]}[/dim]" + ) pass if additional_context: merged_context.update(additional_context) self.additional_context = merged_context - keys_str = ", ".join(sorted(self.additional_context.keys())) if self.additional_context else "(none)" - self.rich_console.print(f"[dim]Run additional context (CLI):[/dim] [cyan]{keys_str}[/cyan]") + keys_str = ( + ", ".join(sorted(self.additional_context.keys())) + if self.additional_context + else "(none)" + ) + self.rich_console.print( + f"[dim]Run additional context (CLI):[/dim] [cyan]{keys_str}[/cyan]" + ) # Track if we copied MODEL_DIR contents (for cleanup) self._copied_from_model_dir = False - + # Track if we ran build phase in this workflow (for log combination) self._did_build_phase = False - + # Initialize session tracker for filtering current run results perf_csv_path = getattr(args, "output", "perf.csv") self.session_tracker = SessionTracker(perf_csv_path) - + # Initialize context in runtime mode (with GPU detection for local) # This will be lazy-initialized only when needed self.context = None @@ -104,14 +115,14 @@ def _init_runtime_context(self): """Initialize runtime context (with GPU detection).""" # Always reinitialize context in runtime mode for run phase # This ensures GPU detection and proper runtime context even after build phase - + # Context expects additional_context as a string representation of Python dict # Use repr() instead of json.dumps() because Context uses ast.literal_eval() if self.additional_context: context_string 
= repr(self.additional_context) else: context_string = None - + self.context = Context( additional_context=context_string, build_only_mode=False, @@ -171,7 +182,7 @@ def execute( mad_container_image = None if self.additional_context: mad_container_image = self.additional_context.get("MAD_CONTAINER_IMAGE") - + if mad_container_image: # Local image mode: Skip build, create synthetic manifest if not tags: @@ -186,14 +197,16 @@ def execute( "Example: --tags model_name --additional-context \"{'MAD_CONTAINER_IMAGE': 'rocm/tensorflow:latest'}\"", ], ) - + # Generate synthetic manifest using the provided image manifest_file = self._create_manifest_from_local_image( image_name=mad_container_image, tags=tags, - manifest_output=getattr(self.args, "manifest_output", "build_manifest.json"), + manifest_output=getattr( + self.args, "manifest_output", "build_manifest.json" + ), ) - + # Step 1: Ensure we have a manifest (build if needed) elif not manifest_file or not os.path.exists(manifest_file): if not tags: @@ -209,7 +222,9 @@ def execute( ], ) - self.rich_console.print("[cyan]No manifest found, building first...[/cyan]\n") + self.rich_console.print( + "[cyan]No manifest found, building first...[/cyan]\n" + ) manifest_file = self._build_phase(tags, registry) self._did_build_phase = True # Mark that we built in this workflow @@ -220,44 +235,66 @@ def execute( # (with optional runtime override) with open(manifest_file) as f: manifest = json.load(f) - + deployment_config = manifest.get("deployment_config", {}) - + # Update additional_context with deployment_config for deployment layer if not self.additional_context: self.additional_context = {} - + # Merge deployment_config into additional_context (for deployment layer to use) - for key in ["slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]: + for key in [ + "slurm", + "k8s", + "kubernetes", + "distributed", + "vllm", + "env_vars", + "debug", + ]: if key in deployment_config and key not in 
self.additional_context: self.additional_context[key] = deployment_config[key] - + # Display manifest entries: context (from build) and deployment_config (run/deploy) self.rich_console.print("[bold blue]Build manifest breakdown[/bold blue]\n") manifest_context = manifest.get("context", {}) - self.rich_console.print(Panel( - json.dumps(manifest_context, indent=2) if manifest_context else "(empty)", - title="[bold]Manifest context[/bold] (from build additional context)", - border_style="dim", - padding=(0, 1), - )) - self.rich_console.print(Panel( - json.dumps(deployment_config, indent=2) if deployment_config else "(empty)", - title="[bold]Manifest deployment_config[/bold]", - border_style="dim", - padding=(0, 1), - )) + self.rich_console.print( + Panel( + ( + json.dumps(manifest_context, indent=2) + if manifest_context + else "(empty)" + ), + title="[bold]Manifest context[/bold] (from build additional context)", + border_style="dim", + padding=(0, 1), + ) + ) + self.rich_console.print( + Panel( + ( + json.dumps(deployment_config, indent=2) + if deployment_config + else "(empty)" + ), + title="[bold]Manifest deployment_config[/bold]", + border_style="dim", + padding=(0, 1), + ) + ) self.rich_console.print() # Infer deployment target from config structure (Convention over Configuration) # No explicit "deploy" field needed - presence of k8s/slurm indicates deployment type target = self._infer_deployment_target(self.additional_context) - + # Legacy support: check manifest for explicit target if not target or target == "local": target = deployment_config.get("target", "local") - - self.rich_console.print(f"[bold cyan]Deployment target: {target}[/bold cyan]\n") + + self.rich_console.print( + f"[bold cyan]Deployment target: {target}[/bold cyan]\n" + ) # Use `is True` so MagicMock-based test doubles do not count as enabled. 
skip_requested = getattr(self.args, "skip_model_run", False) is True @@ -293,28 +330,34 @@ def execute( results = self._execute_local(manifest_file, timeout) else: results = self._execute_distributed(target, manifest_file) - + # Combine build and run logs for full workflow if self._did_build_phase and (target == "local" or target == "docker"): self._combine_build_and_run_logs(manifest_file) - + # Add session information to results for filtering results["session_start_row"] = session_start_row - results["session_row_count"] = self.session_tracker.get_session_row_count() - + results["session_row_count"] = ( + self.session_tracker.get_session_row_count() + ) + # Always cleanup madengine package files after execution - self.rich_console.print("\n[dim]🧹 Cleaning up madengine package files...[/dim]") + self.rich_console.print( + "\n[dim]🧹 Cleaning up madengine package files...[/dim]" + ) self._cleanup_model_dir_copies() - + # NOTE: Do NOT cleanup session marker here! # It's needed by display functions in CLI layer # Cleanup happens in CLI after display (via perf_csv_path) - + return results - + except Exception as e: # Always cleanup madengine package files even on error - self.rich_console.print("\n[dim]🧹 Cleaning up madengine package files...[/dim]") + self.rich_console.print( + "\n[dim]🧹 Cleaning up madengine package files...[/dim]" + ) self._cleanup_model_dir_copies() raise @@ -360,56 +403,63 @@ def _build_phase(self, tags: list, registry: Optional[str] = None) -> str: return manifest_file def _create_manifest_from_local_image( - self, - image_name: str, - tags: list, - manifest_output: str = "build_manifest.json" + self, image_name: str, tags: list, manifest_output: str = "build_manifest.json" ) -> str: """ Create a synthetic manifest for a user-provided local image. - + This enables MAD_CONTAINER_IMAGE functionality where users can skip the build phase and directly run models using a pre-existing Docker image. 
- + Args: image_name: Docker image name/tag (e.g., 'rocm/tensorflow:latest') tags: Model tags to discover manifest_output: Output path for the manifest file - + Returns: Path to the generated manifest file - + Raises: DiscoveryError: If no models are found RuntimeError: If image validation fails """ from madengine.utils.discover_models import DiscoverModels from madengine.core.errors import DiscoveryError - - self.rich_console.print(f"[yellow]🏠 Local Image Mode: Using {image_name}[/yellow]") - self.rich_console.print(f"[dim]Skipping build phase, creating synthetic manifest...[/dim]\n") - + + self.rich_console.print( + f"[yellow]🏠 Local Image Mode: Using {image_name}[/yellow]" + ) + self.rich_console.print( + f"[dim]Skipping build phase, creating synthetic manifest...[/dim]\n" + ) + # Validate that the image exists locally or can be pulled try: self.console.sh(f"docker image inspect {image_name} > /dev/null 2>&1") - self.rich_console.print(f"[green]✓ Image {image_name} found locally[/green]") + self.rich_console.print( + f"[green]✓ Image {image_name} found locally[/green]" + ) except (subprocess.CalledProcessError, RuntimeError) as e: - self.rich_console.print(f"[yellow]⚠️ Image {image_name} not found locally, attempting to pull...[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ Image {image_name} not found locally, attempting to pull...[/yellow]" + ) try: self.console.sh(f"docker pull {image_name}") - self.rich_console.print(f"[green]✓ Successfully pulled {image_name}[/green]") + self.rich_console.print( + f"[green]✓ Successfully pulled {image_name}[/green]" + ) except Exception as e: raise RuntimeError( f"Failed to find or pull image {image_name}. " f"Ensure the image exists locally or can be pulled from a registry. 
" f"Error: {e}" ) - + # Discover models by tags (without building) self.args.tags = tags discover_models = DiscoverModels(args=self.args) models = discover_models.run() - + if not models: raise DiscoveryError( "No models discovered for local image mode", @@ -423,17 +473,21 @@ def _create_manifest_from_local_image( "Ensure model definitions have matching tags", ], ) - - self.rich_console.print(f"[green]✓ Discovered {len(models)} model(s) for tags: {tags}[/green]\n") - + + self.rich_console.print( + f"[green]✓ Discovered {len(models)} model(s) for tags: {tags}[/green]\n" + ) + # Initialize build-only context for manifest generation # (we need context structure, but skip GPU detection since we're not building) - context_string = repr(self.additional_context) if self.additional_context else None + context_string = ( + repr(self.additional_context) if self.additional_context else None + ) build_context = Context( additional_context=context_string, build_only_mode=True, ) - + # Create manifest structure manifest = { "built_images": {}, @@ -443,13 +497,13 @@ def _create_manifest_from_local_image( "local_image_name": image_name, "deployment_config": self.additional_context.get("deployment_config", {}), } - + # For each model, create a synthetic entry using the provided image for model in models: model_name = model["name"] # Create a synthetic image identifier (not an actual built image) synthetic_image_id = f"local-{model_name.replace('/', '_')}" - + manifest["built_images"][synthetic_image_id] = { "docker_image": image_name, # Use user-provided image "dockerfile": "N/A (local image mode)", @@ -458,22 +512,26 @@ def _create_manifest_from_local_image( "local_image": True, "registry_image": None, } - + # Convert data list to comma-separated string (required by dataprovider) data_field = model.get("data", []) if isinstance(data_field, list): data_str = ",".join(data_field) if data_field else "" else: data_str = data_field if data_field else "" - + # Build model info dict with 
all fields that ContainerRunner expects # Use exact field names from models.json format manifest["built_models"][synthetic_image_id] = { "name": model_name, "tags": model.get("tags", []), "dockerfile": "N/A (local image mode)", - "scripts": model.get("scripts", ""), # models.json uses "scripts" (plural) - "n_gpus": model.get("n_gpus", "1"), # models.json uses "n_gpus" (string format) + "scripts": model.get( + "scripts", "" + ), # models.json uses "scripts" (plural) + "n_gpus": model.get( + "n_gpus", "1" + ), # models.json uses "n_gpus" (string format) "owner": model.get("owner", ""), "training_precision": model.get("training_precision", ""), "args": model.get("args", ""), # Required field for docker run @@ -482,16 +540,22 @@ def _create_manifest_from_local_image( "cred": model.get("cred", ""), "deprecated": model.get("deprecated", False), "skip_gpu_arch": model.get("skip_gpu_arch", []), - "additional_docker_run_options": model.get("additional_docker_run_options", ""), + "additional_docker_run_options": model.get( + "additional_docker_run_options", "" + ), } - + # Write manifest to file with open(manifest_output, "w") as f: json.dump(manifest, f, indent=2) - - self.rich_console.print(f"[green]✓ Generated synthetic manifest: {manifest_output}[/green]") - self.rich_console.print(f"[yellow]⚠️ Warning: User-provided image {image_name}. Model support not guaranteed.[/yellow]\n") - + + self.rich_console.print( + f"[green]✓ Generated synthetic manifest: {manifest_output}[/green]" + ) + self.rich_console.print( + f"[yellow]⚠️ Warning: User-provided image {image_name}. 
Model support not guaranteed.[/yellow]\n" + ) + return manifest_output def _load_and_merge_manifest(self, manifest_file: str) -> str: @@ -510,22 +574,31 @@ def _load_and_merge_manifest(self, manifest_file: str) -> str: if "deployment_config" in manifest: stored_config = manifest["deployment_config"] # Runtime --additional-context overrides stored config - for key in ["deploy", "slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]: + for key in [ + "deploy", + "slurm", + "k8s", + "kubernetes", + "distributed", + "vllm", + "env_vars", + "debug", + ]: if key in self.additional_context: stored_config[key] = self.additional_context[key] manifest["deployment_config"] = stored_config - + # Merge context (tools, pre_scripts, post_scripts, encapsulate_script) if "context" not in manifest: manifest["context"] = {} - + merge_keys = ["tools", "pre_scripts", "post_scripts", "encapsulate_script"] context_updated = False for key in merge_keys: if key in self.additional_context: manifest["context"][key] = self.additional_context[key] context_updated = True - + if context_updated or "deployment_config" in manifest: # Write back merged config with open(manifest_file, "w") as f: @@ -541,16 +614,18 @@ def _execute_local(self, manifest_file: str, timeout: int) -> Dict: # Load manifest first to check if we have Docker images with open(manifest_file, "r") as f: manifest = json.load(f) - + has_docker_images = bool(manifest.get("built_images", {})) - + if has_docker_images: # Using Docker containers - containers have GPU support built-in - self.rich_console.print("[dim cyan]Using Docker containers with built-in GPU support[/dim cyan]\n") - + self.rich_console.print( + "[dim cyan]Using Docker containers with built-in GPU support[/dim cyan]\n" + ) + # Initialize runtime context (runs full GPU detection on compute nodes) self._init_runtime_context() - + # Show node info self._show_node_info() @@ -570,9 +645,14 @@ def _execute_local(self, manifest_file: str, timeout: int) 
-> Dict: if "post_scripts" in manifest_context: self.context.ctx["post_scripts"] = manifest_context["post_scripts"] if "encapsulate_script" in manifest_context: - self.context.ctx["encapsulate_script"] = manifest_context["encapsulate_script"] + self.context.ctx["encapsulate_script"] = manifest_context[ + "encapsulate_script" + ] # Restore docker_env_vars from build context (e.g. MAD_SECRET_HFTOKEN for Primus HF-backed configs) - if "docker_env_vars" in manifest_context and manifest_context["docker_env_vars"]: + if ( + "docker_env_vars" in manifest_context + and manifest_context["docker_env_vars"] + ): if "docker_env_vars" not in self.context.ctx: self.context.ctx["docker_env_vars"] = {} for k, v in manifest_context["docker_env_vars"].items(): @@ -589,9 +669,13 @@ def _execute_local(self, manifest_file: str, timeout: int) -> Dict: if "pre_scripts" in self.additional_context: self.context.ctx["pre_scripts"] = self.additional_context["pre_scripts"] if "post_scripts" in self.additional_context: - self.context.ctx["post_scripts"] = self.additional_context["post_scripts"] + self.context.ctx["post_scripts"] = self.additional_context[ + "post_scripts" + ] if "encapsulate_script" in self.additional_context: - self.context.ctx["encapsulate_script"] = self.additional_context["encapsulate_script"] + self.context.ctx["encapsulate_script"] = self.additional_context[ + "encapsulate_script" + ] # Filter images by GPU vendor and architecture # Filter images by GPU compatibility @@ -604,10 +688,14 @@ def _execute_local(self, manifest_file: str, timeout: int) -> Dict: if has_docker_images: # Docker images: filter by GPU vendor at runtime to avoid cross-vendor execution - self.rich_console.print("[dim cyan]Filtering Docker images by runtime GPU compatibility...[/dim cyan]") + self.rich_console.print( + "[dim cyan]Filtering Docker images by runtime GPU compatibility...[/dim cyan]" + ) else: # Bare-metal execution: filter by runtime GPU - self.rich_console.print("[dim cyan]Filtering 
bare-metal images by runtime GPU compatibility...[/dim cyan]") + self.rich_console.print( + "[dim cyan]Filtering bare-metal images by runtime GPU compatibility...[/dim cyan]" + ) compatible_images = self._filter_images_by_gpu_compatibility( manifest["built_images"], runtime_gpu_vendor, runtime_gpu_arch @@ -629,30 +717,37 @@ def _execute_local(self, manifest_file: str, timeout: int) -> Dict: manifest["built_images"] = compatible_images print(f"Filtered to {len(compatible_images)} compatible images\n") - + # Filter by skip_gpu_arch from model definitions (applies to both Docker and bare-metal) runtime_gpu_arch = self.context.get_system_gpu_architecture() if "built_models" in manifest and compatible_images: - self.rich_console.print("[cyan]Checking skip_gpu_arch model restrictions...[/cyan]") + self.rich_console.print( + "[cyan]Checking skip_gpu_arch model restrictions...[/cyan]" + ) compatible_images = self._filter_images_by_skip_gpu_arch( compatible_images, manifest["built_models"], runtime_gpu_arch ) manifest["built_images"] = compatible_images - print(f"After skip_gpu_arch filtering: {len(compatible_images)} images to run\n") - + print( + f"After skip_gpu_arch filtering: {len(compatible_images)} images to run\n" + ) + # NOTE: Dockerfile context filtering is already done during build phase # Re-filtering during run phase causes issues because: # 1. The build phase already filtered dockerfiles based on build-time context # 2. All built images should be runnable on the runtime node # 3. 
Legacy behavior: filtering happens once (either build or run, not both) - + # Write filtered manifest back to file so runner sees the filtered list with open(manifest_file, "w") as f: json.dump(manifest, f, indent=2) except Exception as e: import traceback - self.rich_console.print(f"[yellow]Warning: GPU/Context filtering failed: {e}[/yellow]") + + self.rich_console.print( + f"[yellow]Warning: GPU/Context filtering failed: {e}[/yellow]" + ) self.rich_console.print(f"[red]Traceback: {traceback.format_exc()}[/red]") self.rich_console.print("[yellow]Proceeding with all images[/yellow]\n") @@ -701,8 +796,10 @@ def _execute_distributed(self, target: str, manifest_file: str) -> Dict: # Add runtime flags to additional_context for deployment layer if "live_output" not in self.additional_context: - self.additional_context["live_output"] = getattr(self.args, "live_output", False) - + self.additional_context["live_output"] = getattr( + self.args, "live_output", False + ) + # Pass session_start_row for result filtering in collect_results session_start_row = self.session_tracker.session_start_row if "session_start_row" not in self.additional_context: @@ -757,37 +854,39 @@ def _show_node_info(self): elif "HOST_AZURE" in host_os: print(self.console.sh("tdnf info rocm-libs", canFail=True)) else: - self.rich_console.print("[yellow]Warning: Unable to detect host OS[/yellow]") + self.rich_console.print( + "[yellow]Warning: Unable to detect host OS[/yellow]" + ) def _cleanup_model_dir_copies(self): """Clean up only madengine package files from scripts/common directory. - + This cleanup removes ONLY the files that were copied from madengine package: - scripts/common/tools.json - scripts/common/test_echo.sh - scripts/common/pre_scripts/ - scripts/common/post_scripts/ - scripts/common/tools/ - + This preserves the user's actual scripts/ and docker/ directories in MAD project. 
""" import shutil import subprocess - + # Only clean up scripts/common/ subdirectories that came from madengine package common_dir = Path("scripts/common") if not common_dir.exists(): return - + # List of items to clean up (from madengine package) items_to_cleanup = [ "tools.json", "test_echo.sh", "pre_scripts", "post_scripts", - "tools" + "tools", ] - + for item_name in items_to_cleanup: item_path = common_dir / item_name if item_path.exists(): @@ -798,14 +897,20 @@ def _cleanup_model_dir_copies(self): subprocess.run( ["chmod", "-R", "+w", str(item_path)], capture_output=True, - timeout=10 + timeout=10, ) - except (subprocess.TimeoutExpired, subprocess.CalledProcessError, OSError) as e: + except ( + subprocess.TimeoutExpired, + subprocess.CalledProcessError, + OSError, + ) as e: print(f"Warning: chmod failed for {item_path}: {e}") shutil.rmtree(item_path) else: item_path.unlink() - self.rich_console.print(f"[dim] Cleaned up: scripts/common/{item_name}[/dim]") + self.rich_console.print( + f"[dim] Cleaned up: scripts/common/{item_name}[/dim]" + ) except Exception as e: # Try with sudo for permission issues try: @@ -813,9 +918,11 @@ def _cleanup_model_dir_copies(self): ["sudo", "rm", "-rf", str(item_path)], check=True, capture_output=True, - timeout=10 + timeout=10, + ) + self.rich_console.print( + f"[dim] Cleaned up: scripts/common/{item_name} (elevated)[/dim]" ) - self.rich_console.print(f"[dim] Cleaned up: scripts/common/{item_name} (elevated)[/dim]") except Exception as e2: self.rich_console.print( f"[yellow]⚠️ Warning: Could not clean up {item_path}: {e2}[/yellow]" @@ -823,84 +930,88 @@ def _cleanup_model_dir_copies(self): def _combine_build_and_run_logs(self, manifest_file: str): """Combine build.live.log and run.live.log into live.log for full workflow. - + For full workflow (build + run), this creates a unified log file by: 1. Reading the manifest to find models that were actually executed in this session 2. 
Finding corresponding *.build.live.log and *.run.live.log files for those models 3. Concatenating them into *.live.log 4. Keeping the original build and run logs for reference - + Args: manifest_file: Path to the manifest file containing executed models """ import json - + # Load manifest to get list of build log files try: with open(manifest_file, "r") as f: manifest = json.load(f) - + built_images = manifest.get("built_images", {}) if not built_images: return # No models to process except Exception as e: - self.rich_console.print(f"[yellow]⚠️ Warning: Could not load manifest for log combining: {e}[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ Warning: Could not load manifest for log combining: {e}[/yellow]" + ) return - + self.rich_console.print("\n[dim]📝 Combining build and run logs...[/dim]") combined_count = 0 - + # Process each built image for image_name, image_info in built_images.items(): # Get build log file name from manifest build_log = image_info.get("log_file") if not build_log or not os.path.exists(build_log): continue # Skip if build log doesn't exist - + # Derive the base name and corresponding run log base_name = build_log.replace(".build.live.log", "") run_log = f"{base_name}.run.live.log" combined_log = f"{base_name}.live.log" - + # Check if run log exists if not os.path.exists(run_log): continue # Skip if run log doesn't exist - + try: # Combine build and run logs - with open(combined_log, 'w') as outfile: + with open(combined_log, "w") as outfile: # Add build log - with open(build_log, 'r') as infile: + with open(build_log, "r") as infile: outfile.write(infile.read()) - + # Add separator outfile.write("\n" + "=" * 80 + "\n") outfile.write("RUN PHASE LOG\n") outfile.write("=" * 80 + "\n\n") - + # Add run log - with open(run_log, 'r') as infile: + with open(run_log, "r") as infile: outfile.write(infile.read()) - + combined_count += 1 self.rich_console.print(f"[dim] Combined: {combined_log}[/dim]") - + except Exception as e: 
self.rich_console.print( f"[yellow]⚠️ Warning: Could not combine logs for {base_name}: {e}[/yellow]" ) - + if combined_count > 0: - self.rich_console.print(f"[dim]✓ Combined {combined_count} log file(s)[/dim]") + self.rich_console.print( + f"[dim]✓ Combined {combined_count} log file(s)[/dim]" + ) def _copy_scripts(self): """Copy common scripts to model directories. - + Handles scenarios: 1. MAD Project: scripts/ already exists in current directory - just add madengine common files 2. External MODEL_DIR: Copy from external path to current directory 3. madengine Testing: Copy from src/madengine/scripts/common - + NOTE: Does NOT delete existing scripts/ or docker/ directories in current working directory. """ import shutil @@ -908,19 +1019,27 @@ def _copy_scripts(self): # Define ignore function for cache files (used for all copy operations) def ignore_cache_files(directory, files): """Ignore Python cache files and directories.""" - return [f for f in files if f.endswith('.pyc') or f == '__pycache__' or f.endswith('.pyo')] - + return [ + f + for f in files + if f.endswith(".pyc") or f == "__pycache__" or f.endswith(".pyo") + ] + # Step 1: Check if MODEL_DIR points to external directory and copy if needed # MODEL_DIR default is "." 
(current directory), so only copy if it's different model_dir_env = os.environ.get("MODEL_DIR", ".") model_dir_abs = os.path.abspath(model_dir_env) current_dir_abs = os.path.abspath(".") - + # Only copy if MODEL_DIR points to a different directory (not current dir) if model_dir_abs != current_dir_abs and os.path.exists(model_dir_env): - self.rich_console.print(f"[yellow]📁 External MODEL_DIR detected: {model_dir_env}[/yellow]") - self.rich_console.print("[yellow]Copying MODEL_DIR contents for run phase...[/yellow]") - + self.rich_console.print( + f"[yellow]📁 External MODEL_DIR detected: {model_dir_env}[/yellow]" + ) + self.rich_console.print( + "[yellow]Copying MODEL_DIR contents for run phase...[/yellow]" + ) + # Copy docker/ and scripts/ from MODEL_DIR (without deleting existing ones first) for subdir in ["docker", "scripts"]: src_path = Path(model_dir_env) / subdir @@ -929,18 +1048,29 @@ def ignore_cache_files(directory, files): # Use copytree with dirs_exist_ok=True to merge instead of replace if dest_path.exists(): # Only warn, don't delete existing directories - self.rich_console.print(f"[dim] Note: Merging {subdir}/ from MODEL_DIR with existing directory[/dim]") - shutil.copytree(src_path, dest_path, dirs_exist_ok=True, ignore=ignore_cache_files) - - self.rich_console.print("[green]✓ MODEL_DIR structure copied (docker/, scripts/)[/green]") + self.rich_console.print( + f"[dim] Note: Merging {subdir}/ from MODEL_DIR with existing directory[/dim]" + ) + shutil.copytree( + src_path, + dest_path, + dirs_exist_ok=True, + ignore=ignore_cache_files, + ) + + self.rich_console.print( + "[green]✓ MODEL_DIR structure copied (docker/, scripts/)[/green]" + ) elif not os.path.exists(model_dir_env): - self.rich_console.print(f"[yellow]⚠️ Warning: MODEL_DIR '{model_dir_env}' does not exist, using current directory[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ Warning: MODEL_DIR '{model_dir_env}' does not exist, using current directory[/yellow]" + ) # Step 2: Copy 
madengine's common scripts (pre_scripts, post_scripts, tools) # This provides the execution framework scripts # Find madengine installation path (works for both development and installed package) madengine_common = None - + # Option 1: Development mode - check if running from source dev_path = Path("src/madengine/scripts/common") if dev_path.exists(): @@ -950,23 +1080,34 @@ def ignore_cache_files(directory, files): # Option 2: Installed package - find via module location try: import madengine + madengine_module_path = Path(madengine.__file__).parent installed_path = madengine_module_path / "scripts" / "common" if installed_path.exists(): madengine_common = installed_path - print(f"Found madengine scripts in installed package: {madengine_common}") + print( + f"Found madengine scripts in installed package: {madengine_common}" + ) except Exception as e: print(f"Could not locate madengine scripts: {e}") - + if madengine_common and madengine_common.exists(): - print(f"Copying madengine common scripts from {madengine_common} to scripts/common") - + print( + f"Copying madengine common scripts from {madengine_common} to scripts/common" + ) + dest_common = Path("scripts/common") # Ensure the destination directory exists before copying dest_common.mkdir(parents=True, exist_ok=True) - + # Copy pre_scripts, post_scripts, tools if they exist - for item in ["pre_scripts", "post_scripts", "tools", "tools.json", "test_echo.sh"]: + for item in [ + "pre_scripts", + "post_scripts", + "tools", + "tools.json", + "test_echo.sh", + ]: src_item = madengine_common / item if src_item.exists(): dest_item = dest_common / item @@ -975,19 +1116,21 @@ def ignore_cache_files(directory, files): shutil.rmtree(dest_item) else: dest_item.unlink() - + if src_item.is_dir(): shutil.copytree(src_item, dest_item, ignore=ignore_cache_files) else: shutil.copy2(src_item, dest_item) print(f" Copied {item}") else: - self.rich_console.print("[yellow]⚠️ Could not find madengine scripts directory[/yellow]") + 
self.rich_console.print( + "[yellow]⚠️ Could not find madengine scripts directory[/yellow]" + ) # Step 3: REMOVED - Distribution to model directories is incorrect # scripts/common should remain at /scripts/common/ for proper relative path access # Model scripts reference it via ../scripts/common/ from their directory (e.g., scripts/dummy/) - # + # # This ensures compatibility with legacy workflow where: # - scripts/common/ stays at working directory root # - Model scripts use ../scripts/common/ relative paths @@ -1008,7 +1151,9 @@ def _filter_images_by_gpu_compatibility( ) compatible_images[model_name] = image_info continue - built_with_vendor = {k: v for k, v in built_images.items() if v.get("gpu_vendor")} + built_with_vendor = { + k: v for k, v in built_images.items() if v.get("gpu_vendor") + } compat, skipped = _filter_by_gpu_compat( built_with_vendor, runtime_gpu_vendor, runtime_gpu_arch ) @@ -1016,7 +1161,7 @@ def _filter_images_by_gpu_compatibility( for model_name, reason in skipped: self.rich_console.print(f"[dim] Skipping {model_name}: {reason}[/dim]") return compatible_images - + def _filter_images_by_gpu_architecture( self, built_images: Dict, runtime_gpu_arch: str ) -> Dict: @@ -1047,9 +1192,11 @@ def _filter_images_by_skip_gpu_arch( self._write_skipped_status(model_name, image_info, gpu_arch) return compatible_images - def _write_skipped_status(self, model_name: str, image_info: Dict, gpu_arch: str) -> None: + def _write_skipped_status( + self, model_name: str, image_info: Dict, gpu_arch: str + ) -> None: """Write SKIPPED status to perf CSV for models that were skipped. 
- + Args: model_name: Name of the model that was skipped image_info: Image information dictionary @@ -1059,7 +1206,7 @@ def _write_skipped_status(self, model_name: str, image_info: Dict, gpu_arch: str from madengine.reporting.update_perf_csv import update_perf_csv import json import tempfile - + # Create a perf entry for the skipped model perf_entry = { "model": model_name, @@ -1067,37 +1214,42 @@ def _write_skipped_status(self, model_name: str, image_info: Dict, gpu_arch: str "reason": f"Model not supported on {gpu_arch} architecture", "gpu_architecture": gpu_arch, } - + # Write to temporary JSON file - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as f: json.dump(perf_entry, f) temp_file = f.name - + # Get output CSV path from args - output_csv = getattr(self.args, 'output', 'perf.csv') - + output_csv = getattr(self.args, "output", "perf.csv") + # Update perf CSV with skipped entry update_perf_csv(exception_result=temp_file, perf_csv=output_csv) - + # Clean up temp file import os + os.unlink(temp_file) - + except Exception as e: - self.rich_console.print(f"[dim] Warning: Could not write SKIPPED status to CSV: {e}[/dim]") + self.rich_console.print( + f"[dim] Warning: Could not write SKIPPED status to CSV: {e}[/dim]" + ) def _infer_deployment_target(self, config: Dict) -> str: """ Infer deployment target from configuration structure. 
- + Convention over Configuration: - Presence of "k8s" or "kubernetes" field → k8s deployment - Presence of "slurm" field → slurm deployment - Neither present → local execution - + Args: config: Configuration dictionary - + Returns: Deployment target: "k8s", "slurm", or "local" """ @@ -1107,5 +1259,3 @@ def _infer_deployment_target(self, config: Dict) -> str: return "slurm" else: return "local" - - diff --git a/src/madengine/reporting/__init__.py b/src/madengine/reporting/__init__.py index af8ef4ae..8152a5cf 100644 --- a/src/madengine/reporting/__init__.py +++ b/src/madengine/reporting/__init__.py @@ -4,11 +4,11 @@ Reporting modules for madengine including performance CSV and superset generation. """ -from .update_perf_csv import PERF_CSV_HEADER, update_perf_csv, flatten_tags +from .update_perf_csv import PERF_CSV_HEADER, flatten_tags, update_perf_csv from .update_perf_super import ( - update_perf_super_json, - update_perf_super_csv, convert_super_json_to_csv, + update_perf_super_csv, + update_perf_super_json, ) __all__ = [ @@ -19,4 +19,3 @@ "update_perf_super_csv", "convert_super_json_to_csv", ] - diff --git a/src/madengine/reporting/csv_to_email.py b/src/madengine/reporting/csv_to_email.py index 4b21bc17..91835b92 100644 --- a/src/madengine/reporting/csv_to_email.py +++ b/src/madengine/reporting/csv_to_email.py @@ -6,9 +6,9 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" -import os import argparse import logging +import os from typing import List, Optional, Tuple import pandas as pd @@ -27,7 +27,7 @@ def find_csv_files(directory: str) -> List[str]: """ csv_files = [] for filename in os.listdir(directory): - if filename.endswith('.csv'): + if filename.endswith(".csv"): csv_files.append(os.path.join(directory, filename)) return sorted(csv_files) @@ -43,23 +43,22 @@ def csv_to_html_section(file_path: str) -> Tuple[str, str]: """ # Read the CSV file df = pd.read_csv(file_path) - + # Get section name from file path base_name = os.path.basename(file_path) section_name = os.path.splitext(base_name)[0] - + # Convert DataFrame to HTML html_table = df.to_html(index=False) - + # Create HTML section with header html_section = f"

{section_name}

\n{html_table}\n" - + return section_name, html_section def convert_directory_csvs_to_html( - directory_path: str, - output_file: str = "run_results.html" + directory_path: str, output_file: str = "run_results.html" ) -> Optional[str]: """Convert all CSV files in a directory to a single HTML file. @@ -77,20 +76,20 @@ def convert_directory_csvs_to_html( # Validate input if not os.path.exists(directory_path): raise FileNotFoundError(f"Directory not found: {directory_path}") - + if not os.path.isdir(directory_path): raise NotADirectoryError(f"Path is not a directory: {directory_path}") # Find all CSV files csv_files = find_csv_files(directory_path) - + if not csv_files: logger.warning(f"No CSV files found in directory: {directory_path}") print(f"⚠️ No CSV files found in {directory_path}") return None print(f"📊 Found {len(csv_files)} CSV file(s) to process") - + # Process each CSV file and combine HTML full_html_content = "" for csv_file in csv_files: @@ -104,18 +103,22 @@ def convert_directory_csvs_to_html( print(f" ✗ Failed to convert {os.path.basename(csv_file)}: {e}") # Write combined HTML to output file - output_path = os.path.join(directory_path, output_file) if directory_path != "." else output_file - - with open(output_path, 'w', encoding='utf-8') as html_file: + output_path = ( + os.path.join(directory_path, output_file) + if directory_path != "." + else output_file + ) + + with open(output_path, "w", encoding="utf-8") as html_file: html_file.write(full_html_content) - + logger.info(f"Generated HTML report: {output_path}") return output_path class ConvertCsvToEmail: """Handler class for CSV to email-ready HTML conversion command. - + This class provides a command-line interface wrapper for converting multiple CSV files in a directory to a consolidated HTML report. """ @@ -131,13 +134,13 @@ def __init__(self, args: argparse.Namespace): def run(self) -> bool: """Execute the CSV to email HTML conversion. 
- + Returns: True if conversion was successful, False otherwise. """ - directory_path = getattr(self.args, 'csv_file_path', '.') or '.' - output_file = getattr(self.args, 'output_file', 'run_results.html') - + directory_path = getattr(self.args, "csv_file_path", ".") or "." + output_file = getattr(self.args, "output_file", "run_results.html") + print("\n" + "=" * 80) print("📧 CONVERTING CSV FILES TO EMAIL REPORT") print("=" * 80) @@ -145,13 +148,13 @@ def run(self) -> bool: try: output_path = convert_directory_csvs_to_html(directory_path, output_file) - + if output_path: print(f"📄 Output file: {output_path}") print("✅ Email report generated successfully") else: print("ℹ️ No files to process") - + print("=" * 80 + "\n") self.return_status = True except (FileNotFoundError, NotADirectoryError) as e: @@ -165,4 +168,3 @@ def run(self) -> bool: self.return_status = False return self.return_status - diff --git a/src/madengine/reporting/csv_to_html.py b/src/madengine/reporting/csv_to_html.py index baf7a027..6c23695d 100644 --- a/src/madengine/reporting/csv_to_html.py +++ b/src/madengine/reporting/csv_to_html.py @@ -6,9 +6,9 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ -import os import argparse import logging +import os from typing import Optional import pandas as pd @@ -17,9 +17,7 @@ def convert_csv_to_html( - file_path: str, - output_path: Optional[str] = None, - include_index: bool = False + file_path: str, output_path: Optional[str] = None, include_index: bool = False ) -> str: """Convert a CSV file to an HTML file. 
@@ -39,8 +37,8 @@ def convert_csv_to_html( # Validate input if not os.path.exists(file_path): raise FileNotFoundError(f"CSV file not found: {file_path}") - - if not file_path.endswith('.csv'): + + if not file_path.endswith(".csv"): raise ValueError(f"File must be a CSV file: {file_path}") # Determine output path @@ -48,8 +46,12 @@ def convert_csv_to_html( base_path = os.path.dirname(file_path) base_name = os.path.basename(file_path) file_name = os.path.splitext(base_name)[0] - - output_path = os.path.join(base_path, f"{file_name}.html") if base_path else f"{file_name}.html" + + output_path = ( + os.path.join(base_path, f"{file_name}.html") + if base_path + else f"{file_name}.html" + ) # Read CSV file logger.info(f"Reading CSV file: {file_path}") @@ -63,6 +65,7 @@ def convert_csv_to_html( file_name = os.path.splitext(os.path.basename(file_path))[0] try: from madengine.utils.log_formatting import print_dataframe_beautiful + print_dataframe_beautiful(df, f"Converting CSV: {file_name}") except ImportError: # Fallback to basic formatting if utils not available @@ -74,9 +77,9 @@ def convert_csv_to_html( # Convert DataFrame to HTML logger.info(f"Converting to HTML: {output_path}") df_html = df.to_html(index=include_index) - + # Write HTML file - with open(output_path, 'w', encoding='utf-8') as html_file: + with open(output_path, "w", encoding="utf-8") as html_file: html_file.write(df_html) logger.info(f"✅ Successfully converted {file_path} to {output_path}") @@ -85,7 +88,7 @@ def convert_csv_to_html( class ConvertCsvToHtml: """Handler class for CSV to HTML conversion command. - + This class provides a command-line interface wrapper for converting CSV files to HTML format. """ @@ -101,12 +104,12 @@ def __init__(self, args: argparse.Namespace): def run(self) -> bool: """Execute the CSV to HTML conversion. - + Returns: True if conversion was successful, False otherwise. 
""" file_path = self.args.csv_file_path - + print("\n" + "=" * 80) print("🔄 CONVERTING CSV TO HTML REPORT") print("=" * 80) @@ -133,4 +136,3 @@ def run(self) -> bool: self.return_status = False return self.return_status - diff --git a/src/madengine/reporting/update_perf_csv.py b/src/madengine/reporting/update_perf_csv.py index f298efa2..68aff238 100644 --- a/src/madengine/reporting/update_perf_csv.py +++ b/src/madengine/reporting/update_perf_csv.py @@ -111,10 +111,12 @@ def handle_multiple_results( # Check that the multiple results CSV has the following required columns: # model, performance, metric - headings = ['model', 'performance', 'metric'] + headings = ["model", "performance", "metric"] for heading in headings: - if not(heading in multiple_results_header): - raise RuntimeError(multiple_results + " file is missing the " + heading + " column") + if not (heading in multiple_results_header): + raise RuntimeError( + multiple_results + " file is missing the " + heading + " column" + ) common_info_json = read_json(common_info) flatten_tags(common_info_json) @@ -125,7 +127,7 @@ def handle_multiple_results( row = common_info_json.copy() model = r.pop("model") row["model"] = model_name + "_" + str(model) - + # Extract all columns from CSV result to ensure proper column alignment # This ensures all result columns (benchmark, tp, inp, out, dtype, etc.) 
are captured for key, value in r.items(): @@ -140,7 +142,7 @@ def handle_multiple_results( for key, value in row.items(): if isinstance(value, (list, tuple)): row[key] = ",".join(str(v) for v in value) - + # Create a single-row DataFrame from the row dict row_df = pd.DataFrame([row]) final_multiple_results_df = pd.concat( @@ -152,24 +154,28 @@ def handle_multiple_results( desired_columns = perf_csv_df.columns.tolist() # Add any additional columns from final_multiple_results_df desired_columns = desired_columns + [ - col for col in final_multiple_results_df.columns if col not in desired_columns + col + for col in final_multiple_results_df.columns + if col not in desired_columns ] # Only select columns that actually exist in final_multiple_results_df to avoid KeyError - available_columns = [col for col in desired_columns if col in final_multiple_results_df.columns] + available_columns = [ + col for col in desired_columns if col in final_multiple_results_df.columns + ] final_multiple_results_df = final_multiple_results_df[available_columns] perf_entry_df_to_csv(final_multiple_results_df) - + # Also save as JSON for consistency with single result workflow # This ensures perf_entry.json is always up-to-date regardless of result type - perf_entry_list = final_multiple_results_df.to_dict(orient='records') + perf_entry_list = final_multiple_results_df.to_dict(orient="records") with open("perf_entry.json", "w") as f: # If multiple entries, save as array; if single, save as object for consistency if len(perf_entry_list) == 1: json.dump(perf_entry_list[0], f, indent=2) else: json.dump(perf_entry_list, f, indent=2) - + if perf_csv_df.empty: perf_csv_df = final_multiple_results_df else: diff --git a/src/madengine/reporting/update_perf_super.py b/src/madengine/reporting/update_perf_super.py index f0b1753c..f4bd9433 100644 --- a/src/madengine/reporting/update_perf_super.py +++ b/src/madengine/reporting/update_perf_super.py @@ -11,48 +11,50 @@ import json import os import typing + 
# third-party imports import pandas as pd + # MAD Engine imports from madengine.utils.config_parser import ConfigParser def read_json(js: str) -> typing.Union[dict, list]: """Read a JSON file. - + Args: js: The path to the JSON file. - + Returns: The JSON dictionary or list. """ - with open(js, 'r') as f: + with open(js, "r") as f: return json.load(f) def write_json(data: typing.Union[dict, list], output_path: str) -> None: """Write data to a JSON file. - + Args: data: The data to write (dict or list). output_path: The path to the output JSON file. """ - with open(output_path, 'w') as f: + with open(output_path, "w") as f: json.dump(data, f, indent=2) def load_perf_super_json(perf_super_json: str) -> list: """Load existing perf_super.json file (cumulative). - + Args: perf_super_json: Path to perf_super.json file. - + Returns: List of performance records, or empty list if file doesn't exist. """ if not os.path.exists(perf_super_json): return [] - + try: data = read_json(perf_super_json) # Ensure it's a list @@ -66,85 +68,85 @@ def load_perf_super_json(perf_super_json: str) -> list: def handle_multiple_results_super( - perf_super_list: list, - multiple_results: str, - common_info: str, - model_name: str, - config_parser: ConfigParser - ) -> list: + perf_super_list: list, + multiple_results: str, + common_info: str, + model_name: str, + config_parser: ConfigParser, +) -> list: """Handle multiple results with config matching. - + Args: perf_super_list: List of existing performance records. multiple_results: The path to the multiple results CSV file. common_info: The path to the common info JSON file. model_name: The model name. config_parser: ConfigParser instance for loading configs. - + Returns: Updated list of performance records with configs. 
""" # Load multiple results CSV multiple_results_df = pd.read_csv(multiple_results) multiple_results_df.columns = multiple_results_df.columns.str.strip() - + # Check required columns - required_cols = ['model', 'performance', 'metric'] + required_cols = ["model", "performance", "metric"] for col in required_cols: if col not in multiple_results_df.columns: raise RuntimeError(f"{multiple_results} file is missing the {col} column") - + # Load common info common_info_json = read_json(common_info) - + # Parse config file from args if present configs_data = None - if 'args' in common_info_json and common_info_json['args']: + if "args" in common_info_json and common_info_json["args"]: # model_scripts_path: use None so resolution relies on config_parser.scripts_base_dir # (callers pass scripts_base_dir when creating the parser; 'pipeline' is not a path) - configs_data = config_parser.parse_and_load( - common_info_json['args'], - None - ) - + configs_data = config_parser.parse_and_load(common_info_json["args"], None) + # Process each result row for result_row in multiple_results_df.to_dict(orient="records"): record = common_info_json.copy() - + # Update model name result_model = result_row.pop("model") record["model"] = f"{model_name}_{result_model}" - + # Extract standard performance/metric columns record["performance"] = result_row.pop("performance") record["metric"] = result_row.pop("metric") # test_duration for Duration column in reports (avoid N/A when CSV has it) _td = result_row.pop("test_duration", "") - record["test_duration"] = "" if (_td is None or _td == "" or pd.isna(_td)) else str(_td) + record["test_duration"] = ( + "" if (_td is None or _td == "" or pd.isna(_td)) else str(_td) + ) # Put remaining metrics into multi_results # Exclude internal fields that shouldn't be in multi_results - extra_metrics = {k: v for k, v in result_row.items() - if k not in ["status"] and pd.notna(v)} + extra_metrics = { + k: v for k, v in result_row.items() if k not in ["status"] 
and pd.notna(v) + } if extra_metrics: record["multi_results"] = extra_metrics else: record["multi_results"] = None - + # Set status based on performance - if record.get("performance") is not None and pd.notna(record.get("performance")): + if record.get("performance") is not None and pd.notna( + record.get("performance") + ): record["status"] = "SUCCESS" else: record["status"] = "FAILURE" - + # Match config to this specific result if configs_data: if isinstance(configs_data, list): # For CSV configs with multiple rows, try to match matched_config = config_parser.match_config_to_result( - configs_data, - result_row, - result_model + configs_data, result_row, result_model ) record["configs"] = matched_config else: @@ -152,77 +154,71 @@ def handle_multiple_results_super( record["configs"] = configs_data else: record["configs"] = None - + perf_super_list.append(record) - + return perf_super_list -def handle_single_result_super( - perf_super_list: list, - single_result: str - ) -> list: +def handle_single_result_super(perf_super_list: list, single_result: str) -> list: """Handle a single result. - + Args: perf_super_list: List of existing performance records. single_result: The path to the single result JSON file. - + Returns: Updated list of performance records. """ single_result_json = read_json(single_result) - + # Ensure configs field exists (may be None) if "configs" not in single_result_json: single_result_json["configs"] = None - + # Ensure multi_results field exists (may be None) if "multi_results" not in single_result_json: single_result_json["multi_results"] = None - + perf_super_list.append(single_result_json) return perf_super_list -def handle_exception_result_super( - perf_super_list: list, - exception_result: str - ) -> list: +def handle_exception_result_super(perf_super_list: list, exception_result: str) -> list: """Handle an exception result. - + Args: perf_super_list: List of existing performance records. 
exception_result: The path to the exception result JSON file. - + Returns: Updated list of performance records. """ exception_result_json = read_json(exception_result) - + # Ensure configs field exists (may be None) if "configs" not in exception_result_json: exception_result_json["configs"] = None - + # Ensure multi_results field exists (may be None) if "multi_results" not in exception_result_json: exception_result_json["multi_results"] = None - + perf_super_list.append(exception_result_json) return perf_super_list def update_perf_super_json( - perf_super_json: str, - multiple_results: typing.Optional[str] = None, - single_result: typing.Optional[str] = None, - exception_result: typing.Optional[str] = None, - common_info: typing.Optional[str] = None, - model_name: typing.Optional[str] = None, - scripts_base_dir: typing.Optional[str] = None, - ) -> int: + perf_super_json: str, + multiple_results: typing.Optional[str] = None, + single_result: typing.Optional[str] = None, + exception_result: typing.Optional[str] = None, + common_info: typing.Optional[str] = None, + model_name: typing.Optional[str] = None, + scripts_base_dir: typing.Optional[str] = None, +) -> int: """Update the perf_super.json file (cumulative) with the latest performance data. - + Args: perf_super_json: Path to perf_super.json file (cumulative). multiple_results: Path to multiple results CSV file. @@ -231,7 +227,7 @@ def update_perf_super_json( common_info: Path to common info JSON file. model_name: The model name. scripts_base_dir: Base directory for scripts (for config file resolution). - + Returns: Number of entries added in this update. 
""" @@ -239,14 +235,14 @@ def update_perf_super_json( print("📊 UPDATING PERFORMANCE SUPERSET DATABASE") print("=" * 80) print(f"📂 Target file: {perf_super_json}") - + # Load existing perf_super.json perf_super_list = load_perf_super_json(perf_super_json) initial_count = len(perf_super_list) - + # Create config parser config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) - + # Handle different result types if multiple_results: print("🔄 Processing multiple results with configs...") @@ -268,23 +264,23 @@ def update_perf_super_json( else: print("ℹ️ No results to update in perf_super.json") return 0 - + # Write updated perf_super.json write_json(perf_super_list, perf_super_json) entries_added = len(perf_super_list) - initial_count print(f"✅ Successfully updated: {perf_super_json} (added {entries_added} entries)") print("=" * 80 + "\n") - + return entries_added def generate_perf_entry_super_json( perf_super_json: str = "perf_super.json", perf_entry_super_json: str = "perf_entry_super.json", - num_entries: int = 1 + num_entries: int = 1, ) -> None: """Generate perf_entry_super.json (latest entries) from perf_super.json (cumulative). 
- + Args: perf_super_json: Path to cumulative JSON source perf_entry_super_json: Path to entry JSON output (latest entries only) @@ -293,31 +289,33 @@ def generate_perf_entry_super_json( if not os.path.exists(perf_super_json): print(f"⚠️ {perf_super_json} not found, skipping entry JSON generation") return - + data = read_json(perf_super_json) if not isinstance(data, list): data = [data] - + if not data: print(f"⚠️ {perf_super_json} is empty, skipping entry JSON generation") return - + # Take the latest num_entries entries entry_data = data[-num_entries:] if num_entries > 0 else [data[-1]] - + # Write to perf_entry_super.json write_json(entry_data, perf_entry_super_json) - print(f"✅ Generated entry JSON: {perf_entry_super_json} ({len(entry_data)} entries)") + print( + f"✅ Generated entry JSON: {perf_entry_super_json} ({len(entry_data)} entries)" + ) def convert_super_json_to_csv( perf_super_json: str, output_csv: str, entry_only: bool = False, - num_entries: int = 1 + num_entries: int = 1, ) -> None: """Convert JSON to CSV format. 
- + Args: perf_super_json: Path to JSON source output_csv: Output CSV path @@ -328,33 +326,33 @@ def convert_super_json_to_csv( if not os.path.exists(perf_super_json): print(f"⚠️ {perf_super_json} not found, skipping CSV generation") return - + data = read_json(perf_super_json) if not isinstance(data, list): data = [data] - + if not data: print(f"⚠️ {perf_super_json} is empty, skipping CSV generation") return - + if entry_only and data: # Take the latest num_entries entries data = data[-num_entries:] if num_entries > 0 else [data[-1]] - + # Convert to DataFrame df = pd.DataFrame(data) - + # Serialize complex fields to JSON strings - if 'configs' in df.columns: - df['configs'] = df['configs'].apply( + if "configs" in df.columns: + df["configs"] = df["configs"].apply( lambda x: json.dumps(x) if x is not None else None ) - - if 'multi_results' in df.columns: - df['multi_results'] = df['multi_results'].apply( + + if "multi_results" in df.columns: + df["multi_results"] = df["multi_results"].apply( lambda x: json.dumps(x) if x is not None else None ) - + # Write to CSV df.to_csv(output_csv, index=False) print(f"✅ Generated CSV: {output_csv} ({len(df)} entries)") @@ -363,10 +361,10 @@ def convert_super_json_to_csv( def update_perf_super_csv( perf_super_json: str = "perf_super.json", perf_super_csv: str = "perf_super.csv", - num_entries: int = 1 + num_entries: int = 1, ) -> None: """Generate perf_entry_super.json, perf_entry_super.csv and perf_super.csv from perf_super.json. 
- + Args: perf_super_json: Path to cumulative JSON source (perf_super.json) perf_super_csv: Path to cumulative CSV (perf_super.csv) @@ -375,27 +373,22 @@ def update_perf_super_csv( print("\n" + "=" * 80) print("📄 GENERATING FILES FROM PERFORMANCE SUPERSET") print("=" * 80) - + # Generate perf_entry_super.json (latest entries from current run) generate_perf_entry_super_json( perf_super_json=perf_super_json, perf_entry_super_json="perf_entry_super.json", - num_entries=num_entries + num_entries=num_entries, ) - + # Generate perf_entry_super.csv (latest entries from current run) convert_super_json_to_csv( "perf_entry_super.json", # Use the entry JSON as source "perf_entry_super.csv", - entry_only=False # Read all from entry JSON (already filtered) + entry_only=False, # Read all from entry JSON (already filtered) ) - + # Generate perf_super.csv (all entries) - convert_super_json_to_csv( - perf_super_json, - perf_super_csv, - entry_only=False - ) - - print("=" * 80 + "\n") + convert_super_json_to_csv(perf_super_json, perf_super_csv, entry_only=False) + print("=" * 80 + "\n") diff --git a/src/madengine/scripts/common/post_scripts/gpu_info_post.sh b/src/madengine/scripts/common/post_scripts/gpu_info_post.sh index 337a9550..04664d1d 100644 --- a/src/madengine/scripts/common/post_scripts/gpu_info_post.sh +++ b/src/madengine/scripts/common/post_scripts/gpu_info_post.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# set -x @@ -38,7 +38,7 @@ if [ ! -f "$OUTPUT" ]; then echo "⚠️ Warning: $OUTPUT not found in $(pwd)" echo "⚠️ This may be expected if multiple gpu_info tools are stacked together" echo "⚠️ and only one ran successfully. Checking for any profiler outputs..." - + # Check if prof.csv exists (default output name) if [ -f "prof.csv" ]; then echo "Found prof.csv - renaming to $OUTPUT" @@ -47,11 +47,11 @@ if [ ! 
-f "$OUTPUT" ]; then echo "Profiler output saved to: $(pwd)/${OUTPUT}" exit 0 fi - + # List all CSV files for debugging echo "Available CSV files in directory:" ls -la *.csv 2>/dev/null || echo "No CSV files found" - + # Don't fail - just warn and exit successfully # This allows other stacked tools to complete their post-scripts echo "⚠️ Profiler output $OUTPUT not found - skipping (non-fatal)" diff --git a/src/madengine/scripts/common/post_scripts/gpu_info_power_stop.sh b/src/madengine/scripts/common/post_scripts/gpu_info_power_stop.sh index 051eb9a7..3b456fd9 100755 --- a/src/madengine/scripts/common/post_scripts/gpu_info_power_stop.sh +++ b/src/madengine/scripts/common/post_scripts/gpu_info_power_stop.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# # Stop gpu_info_power_profiler and collect output set -x @@ -32,23 +32,23 @@ if ! kill -0 "$PROFILER_PID" 2>/dev/null; then echo "⚠️ Warning: Power profiler process (PID: $PROFILER_PID) is not running" else echo "Sending termination signal to power profiler (PID: $PROFILER_PID)..." - + # Send SIGTERM to gracefully stop the profiler kill -TERM "$PROFILER_PID" 2>/dev/null || true - + # Wait for profiler to finish writing output (max 10 seconds) WAIT_COUNT=0 while kill -0 "$PROFILER_PID" 2>/dev/null && [ $WAIT_COUNT -lt 20 ]; do sleep 0.5 WAIT_COUNT=$((WAIT_COUNT + 1)) done - + # Force kill if still running if kill -0 "$PROFILER_PID" 2>/dev/null; then echo "⚠️ Profiler did not stop gracefully, force killing..." 
kill -9 "$PROFILER_PID" 2>/dev/null || true fi - + echo "✓ GPU power profiler stopped" fi @@ -63,4 +63,3 @@ if [ -f "/tmp/gpu_info_power_profiler.log" ]; then tail -20 /tmp/gpu_info_power_profiler.log || true echo "==========================" fi - diff --git a/src/madengine/scripts/common/post_scripts/gpu_info_vram_stop.sh b/src/madengine/scripts/common/post_scripts/gpu_info_vram_stop.sh index 221a283a..3ad91d8c 100755 --- a/src/madengine/scripts/common/post_scripts/gpu_info_vram_stop.sh +++ b/src/madengine/scripts/common/post_scripts/gpu_info_vram_stop.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# # Stop gpu_info_vram_profiler and collect output set -x @@ -32,23 +32,23 @@ if ! kill -0 "$PROFILER_PID" 2>/dev/null; then echo "⚠️ Warning: VRAM profiler process (PID: $PROFILER_PID) is not running" else echo "Sending termination signal to VRAM profiler (PID: $PROFILER_PID)..." - + # Send SIGTERM to gracefully stop the profiler kill -TERM "$PROFILER_PID" 2>/dev/null || true - + # Wait for profiler to finish writing output (max 10 seconds) WAIT_COUNT=0 while kill -0 "$PROFILER_PID" 2>/dev/null && [ $WAIT_COUNT -lt 20 ]; do sleep 0.5 WAIT_COUNT=$((WAIT_COUNT + 1)) done - + # Force kill if still running if kill -0 "$PROFILER_PID" 2>/dev/null; then echo "⚠️ Profiler did not stop gracefully, force killing..." kill -9 "$PROFILER_PID" 2>/dev/null || true fi - + echo "✓ GPU VRAM profiler stopped" fi @@ -63,4 +63,3 @@ if [ -f "/tmp/gpu_info_vram_profiler.log" ]; then tail -20 /tmp/gpu_info_vram_profiler.log || true echo "==========================" fi - diff --git a/src/madengine/scripts/common/post_scripts/post_test.sh b/src/madengine/scripts/common/post_scripts/post_test.sh index 424c012f..31325c53 100644 --- a/src/madengine/scripts/common/post_scripts/post_test.sh +++ b/src/madengine/scripts/common/post_scripts/post_test.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. 
# All rights reserved. -# +# version=${1:-0} echo "Post-Script test called $version" diff --git a/src/madengine/scripts/common/post_scripts/trace.sh b/src/madengine/scripts/common/post_scripts/trace.sh index 1e489861..ef5cc185 100644 --- a/src/madengine/scripts/common/post_scripts/trace.sh +++ b/src/madengine/scripts/common/post_scripts/trace.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# set -e set -x @@ -28,7 +28,7 @@ rpd) # Still create output directory and copy what we can touch "$OUTPUT/trace.rpd" # Create empty file so test can find directory structure fi - + echo "RPD post-script: Checking for rpd2tracing.py script..." if [ -f "./rocmProfileData/tools/rpd2tracing.py" ]; then echo "RPD post-script: rpd2tracing.py found" @@ -38,7 +38,7 @@ rpd) else echo "RPD post-script: Skipping rpd2tracing.py because trace.rpd is missing or empty" # Create empty files so the directory structure exists - touch "$OUTPUT/trace.rpd" + touch "$OUTPUT/trace.rpd" touch "$OUTPUT/trace.json" fi else @@ -49,14 +49,14 @@ rpd) touch "$OUTPUT/trace.rpd" touch "$OUTPUT/trace.json" fi - + cp -vLR --preserve=all "$OUTPUT" "$SAVESPACE" ;; rocprof) # Handle both legacy rocprof (results*) and rocprofv3 (different output format) echo "ROCprof post-script: Collecting profiling output..." 
- + # Check for legacy rocprof results files if ls results* 1> /dev/null 2>&1; then echo "Found rocprof results files" @@ -64,7 +64,7 @@ rocprof) else echo "No rocprof results* files found (may be using rocprofv3)" fi - + # Check for rocprofv3 output directories (UUID pattern like 1e4d92661463/) # rocprofv3 creates directories with hex UUIDs containing .db files found_rocprofv3_output=false @@ -79,18 +79,18 @@ rocprof) fi fi done - + # Also check for other rocprofv3 output patterns if ls rocprofv3-* 1> /dev/null 2>&1; then echo "Found rocprofv3-* files" mv rocprofv3-* "$OUTPUT" 2>/dev/null || true found_rocprofv3_output=true fi - + if [ "$found_rocprofv3_output" = true ]; then echo "Collected rocprofv3 profiling data" fi - + # Check for CSV trace files in subdirectories (rocprof can create hostname subdirectories) # Look for patterns like: hostname/pid_kernel_trace.csv, hostname/pid_hip_api_trace.csv, etc. csv_found=false @@ -106,11 +106,11 @@ rocprof) fi fi done - + if [ "$csv_found" = true ]; then echo "Collected rocprof CSV trace files from subdirectories" fi - + # Consolidate rocprofv3 CSV files so MAD-agent finds rocprofv3_output_* names. # rocprofv3 may write agent_info in -o prefix but kernel_trace/stats with PID prefix or under hostname/pid. for base in agent_info domain_stats kernel_stats kernel_trace hip_api_trace counter_collection; do @@ -123,7 +123,7 @@ rocprof) cp -v "$first" "$canonical" fi done - + # Generate instruction_histogram.json from counter/domain_stats CSV so MAD-agent gets real instruction mix. 
if [ -f "${OUTPUT}/rocprofv3_output_counter_collection.csv" ] || [ -f "${OUTPUT}/rocprofv3_output_domain_stats.csv" ]; then CONVERTER="$(cd "$(dirname "$0")/../tools" 2>/dev/null && pwd)/rocprof_counter_csv_to_instruction_histogram.py" diff --git a/src/madengine/scripts/common/pre_scripts/gpu_info_power_start.sh b/src/madengine/scripts/common/pre_scripts/gpu_info_power_start.sh index d28c5763..ce2bf7d1 100755 --- a/src/madengine/scripts/common/pre_scripts/gpu_info_power_start.sh +++ b/src/madengine/scripts/common/pre_scripts/gpu_info_power_start.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# # Start gpu_info_power_profiler in background mode set -x @@ -60,4 +60,3 @@ sleep 2 touch "$PROFILER_START_FILE" echo "✓ GPU power profiler initialization complete" - diff --git a/src/madengine/scripts/common/pre_scripts/gpu_info_pre.sh b/src/madengine/scripts/common/pre_scripts/gpu_info_pre.sh index 60bd60a0..1b56aecf 100644 --- a/src/madengine/scripts/common/pre_scripts/gpu_info_pre.sh +++ b/src/madengine/scripts/common/pre_scripts/gpu_info_pre.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# gpu_vendor="" if [ -f "/usr/bin/nvidia-smi" ]; then diff --git a/src/madengine/scripts/common/pre_scripts/gpu_info_vram_start.sh b/src/madengine/scripts/common/pre_scripts/gpu_info_vram_start.sh index 2ae8e83d..423ba822 100755 --- a/src/madengine/scripts/common/pre_scripts/gpu_info_vram_start.sh +++ b/src/madengine/scripts/common/pre_scripts/gpu_info_vram_start.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. 
-# +# # Start gpu_info_vram_profiler in background mode set -x @@ -60,4 +60,3 @@ sleep 2 touch "$PROFILER_START_FILE" echo "✓ GPU VRAM profiler initialization complete" - diff --git a/src/madengine/scripts/common/pre_scripts/miopen_build_latest.sh b/src/madengine/scripts/common/pre_scripts/miopen_build_latest.sh index 5a9b4714..cf7f02ca 100644 --- a/src/madengine/scripts/common/pre_scripts/miopen_build_latest.sh +++ b/src/madengine/scripts/common/pre_scripts/miopen_build_latest.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# set -e set -x diff --git a/src/madengine/scripts/common/pre_scripts/pre_test.sh b/src/madengine/scripts/common/pre_scripts/pre_test.sh index 68849453..e87f429c 100644 --- a/src/madengine/scripts/common/pre_scripts/pre_test.sh +++ b/src/madengine/scripts/common/pre_scripts/pre_test.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# version=${1:-0} echo "Pre-Script test called $version" diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/console.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/console.py index b91da1a2..b4aaf76a 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/console.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/console.py @@ -5,36 +5,56 @@ """ import subprocess + class Console: - """ Console class - class to run console commands - """ - def __init__(self, shellVerbose=True, live_output=False): - self.shellVerbose = shellVerbose - self.live_output = live_output + """Console class + class to run console commands + """ + + def __init__(self, shellVerbose=True, live_output=False): + self.shellVerbose = shellVerbose + self.live_output = live_output - def sh(self, command, canFail=False, timeout=60, secret=False, prefix=""): - if self.shellVerbose and not secret: - print("> " + command, flush=True) - proc = subprocess.Popen(command, stdin=subprocess.PIPE, 
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, universal_newlines=True, bufsize=1) - try: - if not self.live_output: - outs, errs = proc.communicate(timeout=timeout) - else: - outs = [] - for stdout_line in iter(proc.stdout.readline, ""): - print(prefix+stdout_line, end="" ) - outs.append(stdout_line ) - outs = ''.join(outs) - proc.stdout.close() - proc.wait(timeout=timeout) - except subprocess.TimeoutExpired as exc: - proc.kill() - raise RuntimeError('Console script timeout') from exc - if proc.returncode != 0: - if not canFail: - if not secret: - raise RuntimeError("Subprocess '" + command + "' failed with exit code " + str(proc.returncode) ) + def sh(self, command, canFail=False, timeout=60, secret=False, prefix=""): + if self.shellVerbose and not secret: + print("> " + command, flush=True) + proc = subprocess.Popen( + command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + universal_newlines=True, + bufsize=1, + ) + try: + if not self.live_output: + outs, errs = proc.communicate(timeout=timeout) else: - raise RuntimeError("Subprocess '" + secret + "' failed with exit code " + str(proc.returncode) ) - return outs.strip() + outs = [] + for stdout_line in iter(proc.stdout.readline, ""): + print(prefix + stdout_line, end="") + outs.append(stdout_line) + outs = "".join(outs) + proc.stdout.close() + proc.wait(timeout=timeout) + except subprocess.TimeoutExpired as exc: + proc.kill() + raise RuntimeError("Console script timeout") from exc + if proc.returncode != 0: + if not canFail: + if not secret: + raise RuntimeError( + "Subprocess '" + + command + + "' failed with exit code " + + str(proc.returncode) + ) + else: + raise RuntimeError( + "Subprocess '" + + secret + + "' failed with exit code " + + str(proc.returncode) + ) + return outs.strip() diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py index 
7c1599ab..8da270d8 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py @@ -2,12 +2,14 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ + import os import shlex import shutil + from console import Console -''' +""" CSV Parser - parses various sys config log files and dumps into CSV. Only the below tags are supported. Enable dumping it via adding --dump-csv in rocEnvTool @@ -21,7 +23,9 @@ rocm_env_variables pip_list numa_balancing -''' +""" + + class CSVParser: def __init__(self, filename, sys_config_files_path, tags, path_resolver=None): self.filename = filename @@ -46,7 +50,7 @@ def determine_gpu_device_type(self): return gpu_device_type def get_log_file_data(self, log_file_path): - fs = open(log_file_path, 'r') + fs = open(log_file_path, "r") lines = fs.readlines() fs.close() @@ -64,7 +68,7 @@ def dump_os_information_in_csv(self, os_info_path): info_list.append("Node name|" + values[1]) info_list.append("Kernel version| " + values[2]) if "PRETTY_NAME" in line: - info_list.append("OS version|" + line.split("=")[1].replace('"', '')) + info_list.append("OS version|" + line.split("=")[1].replace('"', "")) return info_list def dump_cpu_information_in_csv(self, cpu_log_path): @@ -89,24 +93,24 @@ def dump_gpu_information_in_csv(self, gpu_log_path, device_type): num_gpu = 0 for j in range(1, len(lines)): line = lines[j].rstrip() - if ("Name:" in line and "gfx" in line): + if "Name:" in line and "gfx" in line: name = line.split(":")[1].lstrip() - if ("Uuid:" in line): + if "Uuid:" in line: uuid = line.split(":")[1].lstrip() - if ("Marketing Name:" in line): + if "Marketing Name:" in line: marketing_name = line.split(":")[1].lstrip() - if ("Vendor Name:" in line): + if "Vendor Name:" in line: vendor_name = line.split(":")[1].lstrip() - if ("Device Type:" in line): + if "Device Type:" in line: device_type = line.split(":")[1].lstrip() if device_type == 
"GPU": break for j in range(1, len(lines)): line = lines[j].rstrip() - if ("Device Type:" in line): + if "Device Type:" in line: device_type = line.split(":")[1].lstrip() - if (device_type == "GPU"): + if device_type == "GPU": num_gpu += 1 info_list.append("Name|" + name) info_list.append("Uuid|" + uuid) @@ -196,7 +200,7 @@ def dump_rocm_env_variables_in_csv(self, log_path): info_list.append(lines[0].rstrip()) for j in range(1, len(lines)): env_values = lines[j].rstrip().split("=") - if (env_values[0]): + if env_values[0]: info_list.append(env_values[0] + "|" + env_values[1]) return info_list @@ -256,7 +260,7 @@ def dump_cuda_env_variables_in_csv(self, log_path): def dump_csv_output(self): gpu_device_type = self.gpu_device_type - fs = open(self.filename, 'w') + fs = open(self.filename, "w") fs.write("sep=|") fs.write("\n") sys_config_info = [] @@ -270,23 +274,37 @@ def dump_csv_output(self): if tag == "cpu_information": sys_config_info.extend(self.dump_cpu_information_in_csv(log_path)) if tag == "gpu_information": - sys_config_info.extend(self.dump_gpu_information_in_csv(log_path, gpu_device_type)) + sys_config_info.extend( + self.dump_gpu_information_in_csv(log_path, gpu_device_type) + ) if tag == "rocm_smi_gpudeviceid": - sys_config_info.extend(self.dump_rocm_smi_gpudeviceid_in_csv(log_path)) + sys_config_info.extend( + self.dump_rocm_smi_gpudeviceid_in_csv(log_path) + ) if tag == "memory_information": - sys_config_info.extend(self.dump_memory_information_in_csv(log_path)) + sys_config_info.extend( + self.dump_memory_information_in_csv(log_path) + ) if tag == "rocm_information": sys_config_info.extend(self.dump_rocm_information_in_csv(log_path)) if tag == "rocm_packages_installed": - sys_config_info.extend(self.dump_rocm_packages_installed_in_csv(log_path)) + sys_config_info.extend( + self.dump_rocm_packages_installed_in_csv(log_path) + ) if tag == "rocm_env_variables": - sys_config_info.extend(self.dump_rocm_env_variables_in_csv(log_path)) + 
sys_config_info.extend( + self.dump_rocm_env_variables_in_csv(log_path) + ) if tag == "cuda_information": sys_config_info.extend(self.dump_cuda_information_in_csv(log_path)) if tag == "cuda_packages_installed": - sys_config_info.extend(self.dump_cuda_packages_installed_in_csv(log_path)) + sys_config_info.extend( + self.dump_cuda_packages_installed_in_csv(log_path) + ) if tag == "cuda_env_variables": - sys_config_info.extend(self.dump_cuda_env_variables_in_csv(log_path)) + sys_config_info.extend( + self.dump_cuda_env_variables_in_csv(log_path) + ) if tag == "pip_list": sys_config_info.extend(self.dump_pip_list_in_csv(log_path)) if tag == "numa_balancing": @@ -298,14 +316,14 @@ def dump_csv_output(self): fs.write(sys_config_info[j]) fs.write("\n") fs.close() - print("\n" + "="*60) + print("\n" + "=" * 60) print(f"✅ SUCCESS: System config data dumped to {self.filename}") - print("="*60 + "\n") + print("=" * 60 + "\n") def print_csv_output(self): - print("\n" + "="*80) + print("\n" + "=" * 80) print("📋 SYSTEM CONFIG INFO - ENVIRONMENT VARIABLES") - print("="*80) + print("=" * 80) if self.sys_config_info_list: for j in range(len(self.sys_config_info_list)): line = self.sys_config_info_list[j] @@ -317,4 +335,4 @@ def print_csv_output(self): print(f"📌 {line}") else: print("❌ No system config information available") - print("="*80 + "\n") + print("=" * 80 + "\n") diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py index b2288dea..da94d25b 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py @@ -2,6 +2,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ + import os import sys import argparse @@ -21,10 +22,11 @@ class CommandInfo: - ''' - section_info (str): Name of the section. - cmds (list) : command list for a particular section. 
- ''' + """ + section_info (str): Name of the section. + cmds (list) : command list for a particular section. + """ + def __init__(self, section_info, cmds): self.section_info = section_info self.cmds = cmds @@ -34,53 +36,53 @@ class RocmPathResolver: """ Detects and resolves ROCm installation paths for both TheRock and traditional installations. """ - + def __init__(self, verbose: bool = False): self.verbose = verbose - self.installation_type = 'unknown' + self.installation_type = "unknown" self.rocm_root = None self.paths = { - 'rocminfo': None, - 'rocm_smi': None, - 'hipcc': None, - 'amdclang': None, - 'version_file': None, - 'manifest_file': None, + "rocminfo": None, + "rocm_smi": None, + "hipcc": None, + "amdclang": None, + "version_file": None, + "manifest_file": None, } self.therock_details = {} self.detect() - + def log(self, message: str): """Print verbose log messages.""" if self.verbose: print(f"[DEBUG] {message}") - + def detect(self): """Detect ROCm installation type and locate components.""" # Method 1: Check for TheRock via rocm-sdk command if self._detect_therock_python_package(): return - + # Method 2: Check environment variables for TheRock if self._detect_therock_from_env(): return - + # Method 3: Check for TheRock in common paths if self._detect_therock_tarball(): return - + # Method 4: Fallback to traditional ROCm if self._detect_traditional_rocm(): return - + # Method 5: Try to find binaries in PATH self._detect_from_path() - + def _is_therock_installation(self, path: Path) -> bool: """Check if a path contains TheRock installation markers.""" if not path.exists(): return False - + # Check for TheRock manifest manifest_path = path / "share" / "therock" / "therock_manifest.json" if manifest_path.exists(): @@ -88,27 +90,27 @@ def _is_therock_installation(self, path: Path) -> bool: try: with open(manifest_path, "r") as f: manifest = json.load(f) - self.therock_details['manifest'] = manifest + self.therock_details["manifest"] = manifest except 
Exception as e: self.log(f"Error reading manifest: {e}") return True - + # Check for dist_info.json dist_info_path = path / "share" / "therock" / "dist_info.json" if dist_info_path.exists(): self.log(f"Found TheRock dist_info at {dist_info_path}") return True - + return False - + def _detect_therock_python_package(self) -> bool: """Detect TheRock via Python package installation.""" self.log("Checking for rocm-sdk command...") - + rocm_sdk_path = shutil.which("rocm-sdk") if rocm_sdk_path: self.log(f"Found rocm-sdk at {rocm_sdk_path}") - + try: # Get root path from rocm-sdk result = subprocess.run( @@ -120,36 +122,36 @@ def _detect_therock_python_package(self) -> bool: if result.returncode == 0: root_path = Path(result.stdout.strip()) if self._is_therock_installation(root_path): - self.installation_type = 'therock' + self.installation_type = "therock" self.rocm_root = str(root_path) self._populate_therock_paths(root_path) return True except Exception as e: self.log(f"Error getting rocm-sdk path: {e}") - + return False - + def _detect_therock_from_env(self) -> bool: """Detect TheRock from environment variables.""" self.log("Checking environment variables...") - - for var in ['ROCM_PATH', 'ROCM_HOME', 'HIP_PATH']: + + for var in ["ROCM_PATH", "ROCM_HOME", "HIP_PATH"]: value = os.environ.get(var) if value: path = Path(value) if self._is_therock_installation(path): self.log(f"Found TheRock via ${var}={value}") - self.installation_type = 'therock' + self.installation_type = "therock" self.rocm_root = str(path) self._populate_therock_paths(path) return True - + return False - + def _detect_therock_tarball(self) -> bool: """Detect TheRock tarball installations in common paths.""" self.log("Checking common TheRock installation paths...") - + common_paths = [ Path("/opt/rocm"), Path.home() / "rocm", @@ -157,87 +159,95 @@ def _detect_therock_tarball(self) -> bool: Path("/usr/local/rocm"), Path.home() / ".local" / "rocm", ] - + for path in common_paths: if 
self._is_therock_installation(path): self.log(f"Found TheRock at {path}") - self.installation_type = 'therock' + self.installation_type = "therock" self.rocm_root = str(path) self._populate_therock_paths(path) return True - + return False - + def _detect_traditional_rocm(self) -> bool: """Detect traditional ROCm installation.""" self.log("Checking for traditional ROCm installation...") - + # Check for traditional ROCm marker version_file = Path("/opt/rocm/.info/version") if version_file.exists(): self.log("Found traditional ROCm at /opt/rocm") - self.installation_type = 'traditional' + self.installation_type = "traditional" self.rocm_root = "/opt/rocm" self._populate_traditional_paths() return True - + return False - + def _detect_from_path(self): """Try to find ROCm binaries in PATH.""" self.log("Searching for ROCm binaries in PATH...") - + # Try to find rocminfo rocminfo = shutil.which("rocminfo") if rocminfo: - self.paths['rocminfo'] = rocminfo + self.paths["rocminfo"] = rocminfo # Try to infer root from binary location rocminfo_path = Path(rocminfo) if rocminfo_path.exists(): potential_root = rocminfo_path.parent.parent if self._is_therock_installation(potential_root): - self.installation_type = 'therock' + self.installation_type = "therock" self.rocm_root = str(potential_root) self._populate_therock_paths(potential_root) else: - self.installation_type = 'unknown' + self.installation_type = "unknown" self.rocm_root = str(potential_root) - + # Try to find other binaries - self.paths['rocm_smi'] = shutil.which("rocm-smi") - self.paths['hipcc'] = shutil.which("hipcc") - self.paths['amdclang'] = shutil.which("amdclang") - + self.paths["rocm_smi"] = shutil.which("rocm-smi") + self.paths["hipcc"] = shutil.which("hipcc") + self.paths["amdclang"] = shutil.which("amdclang") + def _populate_therock_paths(self, root: Path): """Populate paths for TheRock installation.""" bin_dir = root / "bin" - - self.paths['rocminfo'] = str(bin_dir / "rocminfo") if (bin_dir / 
"rocminfo").exists() else None - self.paths['rocm_smi'] = str(bin_dir / "rocm-smi") if (bin_dir / "rocm-smi").exists() else None - self.paths['hipcc'] = str(bin_dir / "hipcc") if (bin_dir / "hipcc").exists() else None - self.paths['amdclang'] = str(bin_dir / "amdclang") if (bin_dir / "amdclang").exists() else None - + + self.paths["rocminfo"] = ( + str(bin_dir / "rocminfo") if (bin_dir / "rocminfo").exists() else None + ) + self.paths["rocm_smi"] = ( + str(bin_dir / "rocm-smi") if (bin_dir / "rocm-smi").exists() else None + ) + self.paths["hipcc"] = ( + str(bin_dir / "hipcc") if (bin_dir / "hipcc").exists() else None + ) + self.paths["amdclang"] = ( + str(bin_dir / "amdclang") if (bin_dir / "amdclang").exists() else None + ) + # Check for manifest manifest = root / "share" / "therock" / "therock_manifest.json" if manifest.exists(): - self.paths['manifest_file'] = str(manifest) - + self.paths["manifest_file"] = str(manifest) + def _populate_traditional_paths(self): """Populate paths for traditional ROCm installation.""" - self.paths['rocminfo'] = "/opt/rocm/bin/rocminfo" - self.paths['rocm_smi'] = "/opt/rocm/bin/rocm-smi" - self.paths['hipcc'] = "/opt/rocm/bin/hipcc" - self.paths['version_file'] = "/opt/rocm/.info/version" - + self.paths["rocminfo"] = "/opt/rocm/bin/rocminfo" + self.paths["rocm_smi"] = "/opt/rocm/bin/rocm-smi" + self.paths["hipcc"] = "/opt/rocm/bin/hipcc" + self.paths["version_file"] = "/opt/rocm/.info/version" + def get_version(self) -> str: """Get ROCm version string.""" - if self.installation_type == 'therock': + if self.installation_type == "therock": return self._get_therock_version() - elif self.installation_type == 'traditional': + elif self.installation_type == "traditional": return self._get_traditional_version() else: return "unknown" - + def _get_therock_version(self) -> str: """Get TheRock version from manifest or rocm-sdk.""" # Try rocm-sdk command @@ -253,14 +263,14 @@ def _get_therock_version(self) -> str: return result.stdout.strip() 
except Exception: pass - + # Try manifest file - if self.therock_details.get('manifest'): - commit = self.therock_details['manifest'].get('the_rock_commit', 'unknown') + if self.therock_details.get("manifest"): + commit = self.therock_details["manifest"].get("the_rock_commit", "unknown") return f"TheRock (commit: {commit[:8]})" - + return "TheRock (version unknown)" - + def _get_traditional_version(self) -> str: """Get traditional ROCm version from version file or header.""" # Try version file @@ -270,14 +280,14 @@ def _get_traditional_version(self) -> str: return version_file.read_text().strip() except Exception: pass - + # Try version header version_header = Path("/opt/rocm/include/rocm-core/rocm_version.h") if version_header.exists(): try: content = version_header.read_text() major = minor = patch = 0 - for line in content.split('\n'): + for line in content.split("\n"): if "#define ROCM_VERSION_MAJOR" in line: major = line.split()[-1] if "#define ROCM_VERSION_MINOR" in line: @@ -287,7 +297,7 @@ def _get_traditional_version(self) -> str: return f"rocm-{major}.{minor}.{patch}" except Exception: pass - + return "unknown" @@ -308,11 +318,11 @@ def print_hardware_information(): if os.path.isfile(path): cmd = path break - + if cmd is None: print("WARNING: Install lshw to get hardware information") print(" (TheRock images may not include this by default)") - + if cmd is not None: cmd_info = CommandInfo("HardwareInformation", [cmd]) return cmd_info @@ -333,11 +343,11 @@ def print_cpu_hardware_information(): def print_gpu_hardware_information(gpu_device_type, path_resolver): if gpu_device_type == "AMD": # Use dynamic path from resolver - cmd = path_resolver.paths.get('rocminfo') or "rocminfo" + cmd = path_resolver.paths.get("rocminfo") or "rocminfo" elif gpu_device_type == "NVIDIA": cmd = "nvidia-smi -L" else: - print ("WARNING: Unknown GPU device detected") + print("WARNING: Unknown GPU device detected") cmd = "echo 'Unknown GPU device'" cmd_info = CommandInfo("GPU 
Information", [cmd]) return cmd_info @@ -372,28 +382,28 @@ def print_memory_information(): ## ROCm version data def print_rocm_version_information(path_resolver): global rocm_version - + # List all ROCm-like directories cmd1 = "ls -v -d /opt/rocm* 2>/dev/null || echo 'No /opt/rocm* directories found'" - + # Get version from resolver rocm_version = path_resolver.get_version() - + cmd2 = f"echo '==== Installation Type: {path_resolver.installation_type} ===='" rocm_root_display = path_resolver.rocm_root or "Not found" cmd3 = f"echo '==== ROCm Root: {rocm_root_display} ===='" cmd4 = f"echo '==== Using {rocm_version} to collect ROCm information ===='" - + cmds = [cmd1, cmd2, cmd3, cmd4] - + # Add TheRock-specific info - if path_resolver.installation_type == 'therock': - manifest_file = path_resolver.paths.get('manifest_file') + if path_resolver.installation_type == "therock": + manifest_file = path_resolver.paths.get("manifest_file") if manifest_file: cmd5 = f"echo '==== TheRock Manifest: {manifest_file} ===='" cmd6 = f"cat {manifest_file}" cmds.extend([cmd5, cmd6]) - + cmd_info = CommandInfo("Available ROCm versions", cmds) return cmd_info @@ -401,19 +411,19 @@ def print_rocm_version_information(path_resolver): def print_rocm_repo_setup(path_resolver): """Print repo setup - only for traditional ROCm installations.""" cmds = [] - - if path_resolver.installation_type == 'therock': + + if path_resolver.installation_type == "therock": cmds.append("echo 'TheRock does not use traditional package repositories'") cmds.append("echo 'TheRock is installed via Python pip packages or tarballs'") - + # Try to get pip package info if shutil.which("rocm-sdk"): cmds.append("echo 'Checking rocm-sdk Python package...'") cmds.append("rocm-sdk version || true") cmds.append("rocm-sdk path --root || true") - + # Check if we're in a venv - venv_path = os.environ.get('VIRTUAL_ENV') + venv_path = os.environ.get("VIRTUAL_ENV") if venv_path: cmds.append(f"echo 'Virtual environment: {venv_path}'") 
cmds.append("pip list | grep -i rocm || true") @@ -426,10 +436,10 @@ def print_rocm_repo_setup(path_resolver): cmd = "/bin/grep -i -E 'rocm|amdgpu' /etc/apt/sources.list.d/* || echo 'No ROCm repos found'" elif os.path.exists("/etc/yum.repos.d/"): cmd = "/bin/grep -i -E 'rocm|amdgpu' /etc/yum.repos.d/* || echo 'No ROCm repos found'" - + if cmd: cmds.append(cmd) - + cmd_info = CommandInfo("ROCm Repo Setup", cmds) return cmd_info @@ -437,27 +447,35 @@ def print_rocm_repo_setup(path_resolver): def print_rocm_packages_installed(path_resolver): """Print installed ROCm packages - adapted for TheRock.""" cmds = [] - - if path_resolver.installation_type == 'therock': + + if path_resolver.installation_type == "therock": # Add Pkg type line for CSV parser compatibility cmds.append("echo ' Pkg type: therock'") cmds.append("echo 'Installation Type: TheRock (no system packages)'") cmds.append("echo ''") - + # Check Python packages cmds.append("echo '=== Python ROCm Packages ==='") - cmds.append("pip list 2>/dev/null | grep -i -E 'rocm|hip|torch' || echo 'No Python ROCm packages found'") - + cmds.append( + "pip list 2>/dev/null | grep -i -E 'rocm|hip|torch' || echo 'No Python ROCm packages found'" + ) + # List files in TheRock installation if path_resolver.rocm_root: cmds.append("echo ''") - cmds.append(f"echo '=== TheRock Installation Contents ({path_resolver.rocm_root}) ==='") + cmds.append( + f"echo '=== TheRock Installation Contents ({path_resolver.rocm_root}) ==='" + ) cmds.append(f"ls -lh {path_resolver.rocm_root}/bin/ 2>/dev/null || true") - cmds.append(f"ls -lh {path_resolver.rocm_root}/lib/ 2>/dev/null | head -20 || true") - + cmds.append( + f"ls -lh {path_resolver.rocm_root}/lib/ 2>/dev/null | head -20 || true" + ) + # Check for dist_info if path_resolver.rocm_root: - dist_info = Path(path_resolver.rocm_root) / "share" / "therock" / "dist_info.json" + dist_info = ( + Path(path_resolver.rocm_root) / "share" / "therock" / "dist_info.json" + ) if dist_info.exists(): 
cmds.append("echo ''") cmds.append("echo '=== TheRock Distribution Info ==='") @@ -472,20 +490,20 @@ def print_rocm_packages_installed(path_resolver): k, v = line.rstrip().split("=", 1) d[k] = v.strip('"') except Exception: - d = {'ID_LIKE': 'unknown'} - - pkgtype = d.get('ID_LIKE', d.get('ID', 'unknown')) + d = {"ID_LIKE": "unknown"} + + pkgtype = d.get("ID_LIKE", d.get("ID", "unknown")) # Note: Format must match csv_parser.py expectations (space before "Pkg") cmd1 = "echo ' Pkg type: '" + pkgtype cmds.append(cmd1) - - if 'debian' in pkgtype.lower(): + + if "debian" in pkgtype.lower(): cmd = "/usr/bin/dpkg -l 2>/dev/null | /bin/grep -i -E 'ocl-icd|kfdtest|llvm-amd|miopen|half|^ii hip|hcc|hsa|rocm|atmi|^ii comgr|composa|amd-smi|aomp|amdgpu|rock|mivision|migraph|rocprofiler|roctracer|rocbl|hipify|rocsol|rocthr|rocff|rocalu|rocprim|rocrand|rccl|rocspar|rdc|rocwmma|rpp|openmp|amdfwflash|ocl |opencl' | /usr/bin/sort || echo 'No packages found'" else: cmd = "/usr/bin/rpm -qa 2>/dev/null | /bin/grep -i -E 'ocl-icd|kfdtest|llvm-amd|miopen|half|hip|hcc|hsa|rocm|atmi|comgr|composa|amd-smi|aomp|amdgpu|rock|mivision|migraph|rocprofiler|roctracer|rocblas|hipify|rocsol|rocthr|rocff|rocalu|rocprim|rocrand|rccl|rocspar|rdc|rocwmma|rpp|openmp|amdfwflash|ocl|opencl' | /usr/bin/sort || echo 'No packages found'" - + cmds.append(cmd) - + cmd_info = CommandInfo("ROCm Packages Installed", cmds) return cmd_info @@ -498,23 +516,29 @@ def print_rocm_environment_variables(): def print_rocm_smi_details(smi_config, path_resolver): cmd_info = None - + # Use dynamic path - rocm_smi_cmd = path_resolver.paths.get('rocm_smi') or "rocm-smi" - + rocm_smi_cmd = path_resolver.paths.get("rocm_smi") or "rocm-smi" + if smi_config == "rocm_smi": - cmd_info = CommandInfo("ROCm SMI", [f"{rocm_smi_cmd} || echo 'rocm-smi not available'"]) + cmd_info = CommandInfo( + "ROCm SMI", [f"{rocm_smi_cmd} || echo 'rocm-smi not available'"] + ) elif smi_config == "ifwi_version": ifwi_cmd = f"{rocm_smi_cmd} -v || echo 
'IFWI version not available'" cmd_info = CommandInfo("IFWI version", [ifwi_cmd]) elif smi_config == "rocm_smi_showhw": - showhw_cmd = f"{rocm_smi_cmd} --showhw || echo 'rocm-smi --showhw not available'" + showhw_cmd = ( + f"{rocm_smi_cmd} --showhw || echo 'rocm-smi --showhw not available'" + ) cmd_info = CommandInfo("ROCm SMI showhw", [showhw_cmd]) elif smi_config == "rocm_smi_pcie": pcie_cmd = f"{rocm_smi_cmd} -c 2>/dev/null | /bin/grep -i -E 'pcie' || echo 'PCIe info not available'" cmd_info = CommandInfo("ROCm SMI pcieclk clock", [pcie_cmd]) elif smi_config == "rocm_smi_pids": - pids_cmd1 = "ls /sys/class/kfd/kfd/proc/ 2>/dev/null || echo 'KFD proc not available'" + pids_cmd1 = ( + "ls /sys/class/kfd/kfd/proc/ 2>/dev/null || echo 'KFD proc not available'" + ) pids_cmd2 = f"{rocm_smi_cmd} --showpids || echo 'showpids not available'" cmd_info = CommandInfo("KFD PIDs sysfs kfd proc", [pids_cmd1, pids_cmd2]) elif smi_config == "rocm_smi_topology": @@ -524,13 +548,19 @@ def print_rocm_smi_details(smi_config, path_resolver): serial_cmd = f"{rocm_smi_cmd} --showserial || echo 'showserial not available'" cmd_info = CommandInfo("showserial", [serial_cmd]) elif smi_config == "rocm_smi_showperflevel": - perf_cmd = f"{rocm_smi_cmd} --showperflevel || echo 'showperflevel not available'" + perf_cmd = ( + f"{rocm_smi_cmd} --showperflevel || echo 'showperflevel not available'" + ) cmd_info = CommandInfo("showperflevel", [perf_cmd]) elif smi_config == "rocm_smi_showrasinfo": - showrasinfo_cmd = f"{rocm_smi_cmd} --showrasinfo all || echo 'showrasinfo not available'" + showrasinfo_cmd = ( + f"{rocm_smi_cmd} --showrasinfo all || echo 'showrasinfo not available'" + ) cmd_info = CommandInfo("ROCm SMI showrasinfo all", [showrasinfo_cmd]) elif smi_config == "rocm_smi_showxgmierr": - showxgmierr_cmd = f"{rocm_smi_cmd} --showxgmierr || echo 'showxgmierr not available'" + showxgmierr_cmd = ( + f"{rocm_smi_cmd} --showxgmierr || echo 'showxgmierr not available'" + ) cmd_info = 
CommandInfo("ROCm SMI showxgmierr", [showxgmierr_cmd]) elif smi_config == "rocm_smi_clocks": clock_cmd = f"{rocm_smi_cmd} -cga || echo 'clock info not available'" @@ -539,19 +569,23 @@ def print_rocm_smi_details(smi_config, path_resolver): compute_cmd = f"{rocm_smi_cmd} --showcomputepartition || echo 'showcomputepartition not available'" cmd_info = CommandInfo("ROCm Show computepartition", [compute_cmd]) elif smi_config == "rocm_smi_nodesbw": - nodesbw_cmd = f"{rocm_smi_cmd} --shownodesbw || echo 'shownodesbw not available'" + nodesbw_cmd = ( + f"{rocm_smi_cmd} --shownodesbw || echo 'shownodesbw not available'" + ) cmd_info = CommandInfo("ROCm Show Nodebsion", [nodesbw_cmd]) elif smi_config == "rocm_smi_gpudeviceid": - gpudeviceid_cmd = f"{rocm_smi_cmd} -i -d 0 || echo 'GPU device ID not available'" + gpudeviceid_cmd = ( + f"{rocm_smi_cmd} -i -d 0 || echo 'GPU device ID not available'" + ) cmd_info = CommandInfo("ROCM Show GPU Device ID", [gpudeviceid_cmd]) else: cmd_info = None - + return cmd_info def print_rocm_info_details(path_resolver): - rocminfo_cmd = path_resolver.paths.get('rocminfo') or "rocminfo" + rocminfo_cmd = path_resolver.paths.get("rocminfo") or "rocminfo" cmd = f"{rocminfo_cmd} || echo 'rocminfo not available'" cmd_info = CommandInfo("rocminfo", [cmd]) return cmd_info @@ -566,25 +600,38 @@ def print_dmesg_logs(ignore_prev_boot_logs=True): cmd1_str = "WARNING: Persistent logging possibly disabled.\\n" cmd1_str = cmd1_str + "WARNING: Please run: \\n" cmd1_str = cmd1_str + " sudo mkdir -p /var/log/journal\\n" - cmd1_str = cmd1_str + " sudo systemctl restart systemd-journald.service \\n" - cmd1_str = cmd1_str + "WARNING: to enable persistent boot logs for collection and analysis.\\n" + cmd1_str = ( + cmd1_str + " sudo systemctl restart systemd-journald.service \\n" + ) + cmd1_str = ( + cmd1_str + + "WARNING: to enable persistent boot logs for collection and analysis.\\n" + ) cmd1_str = "echo '" + cmd1_str + "'" cmds.append(cmd1_str) cmds.append("echo 
'Section: dmesg boot logs'") - cmds.append("/bin/dmesg -T 2>/dev/null | /bin/grep -i -E ' Linux v| Command line|power|pnp|pci|gpu|drm|error|xgmi|panic|watchdog|bug|nmi|dazed|too|mce|edac|oop|fail|fault|atom|bios|kfd|vfio|iommu|ras_mask|ECC|smpboot.*CPU|pcieport.*AER|amdfwflash' || echo 'dmesg not available'") - + cmds.append( + "/bin/dmesg -T 2>/dev/null | /bin/grep -i -E ' Linux v| Command line|power|pnp|pci|gpu|drm|error|xgmi|panic|watchdog|bug|nmi|dazed|too|mce|edac|oop|fail|fault|atom|bios|kfd|vfio|iommu|ras_mask|ECC|smpboot.*CPU|pcieport.*AER|amdfwflash' || echo 'dmesg not available'" + ) + if not ignore_prev_boot_logs: cmd_exec = shutil.which("journalctl") - + if cmd_exec is not None: cmds.append("echo 'Section: Current boot logs'") boot_exec = "/bin/grep -i -E ' Linux v| Command line|power|pnp|pci|gpu|drm|error|xgmi|panic|watchdog|bug|nmi|dazed|too|mce|edac|oop|fail|fault|atom|bios|kfd|vfio|iommu|ras_mask|ECC|smpboot.*CPU|pcieport.*AER|amdfwflash'" - cmds.append(f"{cmd_exec} -b 2>/dev/null | {boot_exec} || echo 'journalctl not available'") + cmds.append( + f"{cmd_exec} -b 2>/dev/null | {boot_exec} || echo 'journalctl not available'" + ) cmds.append("echo 'Section: Previous boot logs'") - cmds.append(f"{cmd_exec} -b 1 2>/dev/null | {boot_exec} || echo 'Previous boot logs not available'") + cmds.append( + f"{cmd_exec} -b 1 2>/dev/null | {boot_exec} || echo 'Previous boot logs not available'" + ) cmds.append("echo 'Section: Second boot logs'") - cmds.append(f"{cmd_exec} -b 2 2>/dev/null | {boot_exec} || echo 'Second boot logs not available'") + cmds.append( + f"{cmd_exec} -b 2 2>/dev/null | {boot_exec} || echo 'Second boot logs not available'" + ) cmd_info = CommandInfo("dmesg GPU/DRM/ATOM/BIOS", cmds) return cmd_info @@ -631,21 +678,23 @@ def print_cuda_packages_installed(): if "=" in line: k, v = line.rstrip().split("=", 1) d[k] = v.strip('"') - - pkgtype = d.get('ID_LIKE', d.get('ID', 'unknown')) + + pkgtype = d.get("ID_LIKE", d.get("ID", "unknown")) # Note: 
Format must match csv_parser.py expectations (space before "Pkg") cmd1 = "echo ' Pkg type: '" + pkgtype cmd2 = None - - if 'debian' in pkgtype.lower(): + + if "debian" in pkgtype.lower(): cmd2 = "/usr/bin/dpkg -l 2>/dev/null | /bin/grep -i -E 'cuda|cu|atlas|hdf5|nccl|nvinfer|nvjpeg|onnx' || echo 'No CUDA packages found'" else: cmd2 = "/usr/bin/rpm -qa 2>/dev/null | /bin/grep -i -E 'cuda|cu|atlas|hdf5|nccl|nvinfer|nvjpeg|onnx' || echo 'No CUDA packages found'" - + cmd_info = CommandInfo("CUDA Packages Installed", [cmd1, cmd2]) except Exception as e: - cmd_info = CommandInfo("CUDA Packages Installed", [f"echo 'Error checking packages: {e}'"]) - + cmd_info = CommandInfo( + "CUDA Packages Installed", [f"echo 'Error checking packages: {e}'"] + ) + return cmd_info @@ -663,7 +712,7 @@ def dump_system_env_information(configs, output_name): out_path = os.path.join(out_dir, config) os.makedirs(out_path) log_file = out_path + "/" + config + ".txt" - fs = open(log_file, 'w') + fs = open(log_file, "w") cmd_info = env_map[config] if cmd_info is not None: @@ -683,63 +732,91 @@ def dump_system_env_information(configs, output_name): def determine_gpu_device_type(path_resolver): gpu_device_type = "" - + # Try rocm-smi - rocm_smi_cmd = path_resolver.paths.get('rocm_smi') or "rocm-smi" + rocm_smi_cmd = path_resolver.paths.get("rocm_smi") or "rocm-smi" rocm_smi_out = console.sh(f"{rocm_smi_cmd} 2>/dev/null || true", canFail=True) - + # Try nvidia-smi nv_smi_out = console.sh("nvidia-smi -L 2>/dev/null || true", canFail=True) - + if rocm_smi_out and "not found" not in rocm_smi_out and len(rocm_smi_out) > 10: gpu_device_type = "AMD" elif nv_smi_out and "not found" not in nv_smi_out and len(nv_smi_out) > 10: gpu_device_type = "NVIDIA" - + return gpu_device_type def generate_env_info(gpu_device_type, path_resolver): global env_map - + print(f"Installation Type: {path_resolver.installation_type}") print(f"ROCm Root: {path_resolver.rocm_root or 'Not found'}") print(f"GPU Device Type: 
{gpu_device_type or 'Unknown'}") - + env_map["hardware_information"] = print_hardware_information() env_map["cpu_information"] = print_cpu_hardware_information() - env_map["gpu_information"] = print_gpu_hardware_information(gpu_device_type, path_resolver) + env_map["gpu_information"] = print_gpu_hardware_information( + gpu_device_type, path_resolver + ) env_map["bios_settings"] = print_bios_settings() env_map["os_information"] = print_os_information() env_map["dmsg_gpu_drm_atom_logs"] = print_dmesg_logs(ignore_prev_boot_logs=True) env_map["amdgpu_modinfo"] = print_amdgpu_modinfo() env_map["memory_information"] = print_memory_information() - + if gpu_device_type == "AMD": env_map["rocm_information"] = print_rocm_version_information(path_resolver) env_map["rocm_repo_setup"] = print_rocm_repo_setup(path_resolver) - env_map["rocm_packages_installed"] = print_rocm_packages_installed(path_resolver) + env_map["rocm_packages_installed"] = print_rocm_packages_installed( + path_resolver + ) env_map["rocm_env_variables"] = print_rocm_environment_variables() env_map["rocm_smi"] = print_rocm_smi_details("rocm_smi", path_resolver) env_map["ifwi_version"] = print_rocm_smi_details("ifwi_version", path_resolver) - env_map["rocm_smi_showhw"] = print_rocm_smi_details("rocm_smi_showhw", path_resolver) - env_map["rocm_smi_pcie"] = print_rocm_smi_details("rocm_smi_pcie", path_resolver) - env_map["rocm_smi_pids"] = print_rocm_smi_details("rocm_smi_pids", path_resolver) - env_map["rocm_smi_topology"] = print_rocm_smi_details("rocm_smi_topology", path_resolver) - env_map["rocm_smi_showserial"] = print_rocm_smi_details("rocm_smi_showserial", path_resolver) - env_map["rocm_smi_showperflevel"] = print_rocm_smi_details("rocm_smi_showperflevel", path_resolver) - env_map["rocm_smi_showrasinfo"] = print_rocm_smi_details("rocm_smi_showrasinfo", path_resolver) - env_map["rocm_smi_showxgmierr"] = print_rocm_smi_details("rocm_smi_showxgmierr", path_resolver) - env_map["rocm_smi_clocks"] = 
print_rocm_smi_details("rocm_smi_clocks", path_resolver) - env_map["rocm_smi_showcompute_partition"] = print_rocm_smi_details("rocm_smi_showcompute_partition", path_resolver) - env_map["rocm_smi_nodesbwi"] = print_rocm_smi_details("rocm_smi_nodesbw", path_resolver) - env_map["rocm_smi_gpudeviceid"] = print_rocm_smi_details("rocm_smi_gpudeviceid", path_resolver) + env_map["rocm_smi_showhw"] = print_rocm_smi_details( + "rocm_smi_showhw", path_resolver + ) + env_map["rocm_smi_pcie"] = print_rocm_smi_details( + "rocm_smi_pcie", path_resolver + ) + env_map["rocm_smi_pids"] = print_rocm_smi_details( + "rocm_smi_pids", path_resolver + ) + env_map["rocm_smi_topology"] = print_rocm_smi_details( + "rocm_smi_topology", path_resolver + ) + env_map["rocm_smi_showserial"] = print_rocm_smi_details( + "rocm_smi_showserial", path_resolver + ) + env_map["rocm_smi_showperflevel"] = print_rocm_smi_details( + "rocm_smi_showperflevel", path_resolver + ) + env_map["rocm_smi_showrasinfo"] = print_rocm_smi_details( + "rocm_smi_showrasinfo", path_resolver + ) + env_map["rocm_smi_showxgmierr"] = print_rocm_smi_details( + "rocm_smi_showxgmierr", path_resolver + ) + env_map["rocm_smi_clocks"] = print_rocm_smi_details( + "rocm_smi_clocks", path_resolver + ) + env_map["rocm_smi_showcompute_partition"] = print_rocm_smi_details( + "rocm_smi_showcompute_partition", path_resolver + ) + env_map["rocm_smi_nodesbwi"] = print_rocm_smi_details( + "rocm_smi_nodesbw", path_resolver + ) + env_map["rocm_smi_gpudeviceid"] = print_rocm_smi_details( + "rocm_smi_gpudeviceid", path_resolver + ) env_map["rocm_info"] = print_rocm_info_details(path_resolver) elif gpu_device_type == "NVIDIA": env_map["cuda_information"] = print_cuda_version_information() env_map["cuda_env_variables"] = print_cuda_env_variables() env_map["cuda_packages_installed"] = print_cuda_packages_installed() - + env_map["pip_list"] = print_pip_list_details() if os.path.exists("/proc/sys/kernel/numa_balancing"): @@ -749,22 +826,22 @@ def 
generate_env_info(gpu_device_type, path_resolver): def main(): # Initialize path resolver path_resolver = RocmPathResolver(verbose=args.verbose) - + # Detect GPU type with resolver gpu_device_type = determine_gpu_device_type(path_resolver) - + # Generate environment info generate_env_info(gpu_device_type, path_resolver) - + # Get configs configs = env_map.keys() if args.lite: configs = parse_env_tags_json("env_tags.json") - + # Dump system environment information dump_system_env_information(configs, args.output_name) print(f"OK: finished dumping the system env details in .{args.output_name} folder") - + # CSV output if args.dump_csv or args.print_csv: csv_file = args.output_name + ".csv" @@ -775,21 +852,31 @@ def main(): csv_parser.print_csv_output() -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( description="System environment data collection tool (TheRock + Traditional ROCm compatible)" ) - parser.add_argument("--lite", action="store_true", - help="System environment data lite version taken from env_tags.json") - parser.add_argument("--dump-csv", action="store_true", - help="Dump system config info in CSV file") - parser.add_argument("--print-csv", action="store_true", - help="Print system config info data") - parser.add_argument("--output-name", required=False, default="sys_config_info", - help="Output file or directory name") - parser.add_argument("-v", "--verbose", action="store_true", - help="Enable verbose detection output") - + parser.add_argument( + "--lite", + action="store_true", + help="System environment data lite version taken from env_tags.json", + ) + parser.add_argument( + "--dump-csv", action="store_true", help="Dump system config info in CSV file" + ) + parser.add_argument( + "--print-csv", action="store_true", help="Print system config info data" + ) + parser.add_argument( + "--output-name", + required=False, + default="sys_config_info", + help="Output file or directory name", + ) + parser.add_argument( 
+ "-v", "--verbose", action="store_true", help="Enable verbose detection output" + ) + args = parser.parse_args() console = Console(shellVerbose=False, live_output=False) diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/test_rocenv.sh b/src/madengine/scripts/common/pre_scripts/rocEnvTool/test_rocenv.sh index a817001e..84c40a01 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/test_rocenv.sh +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/test_rocenv.sh @@ -101,11 +101,11 @@ echo echo "Test 7: Output directory validation" if [ -d ".test_basic" ]; then pass "Output directory created" - + # Count subdirectories NUM_SECTIONS=$(find .test_basic -mindepth 1 -maxdepth 1 -type d | wc -l) info "Generated $NUM_SECTIONS information sections" - + if [ "$NUM_SECTIONS" -gt 5 ]; then pass "Sufficient sections generated ($NUM_SECTIONS)" else @@ -136,16 +136,16 @@ echo echo "Test 9: ROCm-specific sections" if [ -d ".test_basic/rocm_information" ]; then pass "ROCm information section generated" - + # Check content if [ -f ".test_basic/rocm_information/rocm_information.txt" ]; then CONTENT=$(cat .test_basic/rocm_information/rocm_information.txt) - + if echo "$CONTENT" | grep -q "Installation Type:"; then DETECTED_TYPE=$(echo "$CONTENT" | grep "Installation Type:" | head -1) pass "ROCm installation type detected: $DETECTED_TYPE" fi - + if echo "$CONTENT" | grep -q "ROCm Root:"; then DETECTED_ROOT=$(echo "$CONTENT" | grep "ROCm Root:" | head -1) pass "ROCm root identified: $DETECTED_ROOT" @@ -161,10 +161,10 @@ echo "Test 10: CSV generation" if python3 rocenv_tool_v2.py --output-name test_csv --dump-csv > /dev/null 2>&1; then if [ -f "test_csv.csv" ]; then pass "CSV file generated" - + LINE_COUNT=$(wc -l < test_csv.csv) info "CSV contains $LINE_COUNT lines" - + if [ "$LINE_COUNT" -gt 10 ]; then pass "CSV contains data" fi @@ -231,4 +231,3 @@ echo "- README_v2.md - Usage guide" echo "- THEROCK_COMPATIBILITY.md - Compatibility details" echo "- 
IMPLEMENTATION_SUMMARY.md - Implementation overview" echo - diff --git a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh index 84879d05..95ac8042 100644 --- a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh +++ b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# OUTPUT_FILE_NAME=${1:-"sys_config_info"} diff --git a/src/madengine/scripts/common/pre_scripts/trace.sh b/src/madengine/scripts/common/pre_scripts/trace.sh index 5c591c83..8a8a16ca 100644 --- a/src/madengine/scripts/common/pre_scripts/trace.sh +++ b/src/madengine/scripts/common/pre_scripts/trace.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# set -e set -x @@ -40,7 +40,7 @@ rpd) else echo "rocmProfileData directory already exists, skipping clone" fi - + # Build RPD tracer locally without system install cd ./rocmProfileData # Workaround for upstream rocmProfileData Makefile typo: UStringTable.o -> StringTable.o @@ -52,7 +52,7 @@ rpd) echo "Error: Failed to build RPD tracer" exit 1 fi - + # Install rocpd Python module locally cd rocpd_python python3 setup.py install @@ -61,7 +61,7 @@ rpd) exit 1 fi cd ../.. - + echo "RPD setup completed successfully" ;; diff --git a/src/madengine/scripts/common/test_echo.sh b/src/madengine/scripts/common/test_echo.sh index 01c2830b..f9e3a7e0 100644 --- a/src/madengine/scripts/common/test_echo.sh +++ b/src/madengine/scripts/common/test_echo.sh @@ -1,7 +1,7 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. 
-# +# echo "$@" diff --git a/src/madengine/scripts/common/tools.json b/src/madengine/scripts/common/tools.json index 82869087..ecb2edb7 100644 --- a/src/madengine/scripts/common/tools.json +++ b/src/madengine/scripts/common/tools.json @@ -340,4 +340,4 @@ "post_scripts": [] } } -} \ No newline at end of file +} diff --git a/src/madengine/scripts/common/tools/amd_smi_utils.py b/src/madengine/scripts/common/tools/amd_smi_utils.py index e0e48096..2057b4c7 100644 --- a/src/madengine/scripts/common/tools/amd_smi_utils.py +++ b/src/madengine/scripts/common/tools/amd_smi_utils.py @@ -6,34 +6,33 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ -import sys import logging -from typing import List, Optional, Dict, Any +import sys +from typing import Any, Dict, List, Optional sys.path.append("/opt/rocm/libexec/amdsmi_cli/") try: - from amdsmi_init import amdsmi_interface - from amdsmi_init import amdsmi_cli_init, amdsmi_cli_shutdown + from amdsmi_init import amdsmi_cli_init, amdsmi_cli_shutdown, amdsmi_interface except ImportError: raise ImportError("Could not import /opt/rocm/libexec/amdsmi_cli/amdsmi_init.py") class ProfUtils: """Class to get GPU information using AMD amd-smi utility. - + Attributes: amdsmi_initialized: Whether amdsmi interface is initialized. processor_handles: List of GPU processor handles. 
""" - + def __init__(self, mode) -> None: """Initialize the amd-smi utils class - + @param mode: Mode parameter for compatibility (not used in amd-smi) """ self.amdsmi_initialized = False self.processor_handles = [] - + try: # Initialize amdsmi using the amdsmi_cli_init function amdsmi_cli_init() @@ -41,7 +40,7 @@ def __init__(self, mode) -> None: logging.debug("amdsmi_cli_init() successful") except Exception as e: raise ImportError(f"Failed to initialize amd-smi interface: {e}") - + try: # Get processor handles (GPU devices) self.processor_handles = amdsmi_interface.amdsmi_get_processor_handles() @@ -53,37 +52,37 @@ def __init__(self, mode) -> None: def get_power(self, device: int) -> str: """Get current socket power of a given device. - + Args: device: GPU device index. - + Returns: Power consumption in watts as string, or 'N/A' if unavailable. """ try: if device >= len(self.processor_handles): - return 'N/A' - + return "N/A" + processor_handle = self.processor_handles[device] power_info = amdsmi_interface.amdsmi_get_power_info(processor_handle) - + # power_info is a dict with keys like 'current_socket_power', 'average_socket_power', etc. # Values are in milliwatts, convert to watts - if 'current_socket_power' in power_info: - power_mw = power_info['current_socket_power'] + if "current_socket_power" in power_info: + power_mw = power_info["current_socket_power"] return str(float(power_mw) / 1000.0) - elif 'average_socket_power' in power_info: - power_mw = power_info['average_socket_power'] + elif "average_socket_power" in power_info: + power_mw = power_info["average_socket_power"] return str(float(power_mw) / 1000.0) - - return 'N/A' + + return "N/A" except Exception as e: logging.debug(f"Failed to get power for device {device}: {e}") - return 'N/A' + return "N/A" def list_devices(self) -> List[int]: """Get list of GPU device indices. - + Returns: List of device indices. 
""" @@ -92,29 +91,33 @@ def list_devices(self) -> List[int]: def get_mem_info(self, device: int) -> float: """Get memory usage percentage for a device. - + Args: device: GPU device index. - + Returns: Memory usage percentage as float. """ try: if device >= len(self.processor_handles): return 0.0 - + processor_handle = self.processor_handles[device] - + # Try to get VRAM usage directly vram_info = amdsmi_interface.amdsmi_get_gpu_vram_usage(processor_handle) - + # vram_info is a dict with 'vram_used' and 'vram_total' in bytes - if isinstance(vram_info, dict) and 'vram_used' in vram_info and 'vram_total' in vram_info: - used = float(vram_info['vram_used']) - total = float(vram_info['vram_total']) + if ( + isinstance(vram_info, dict) + and "vram_used" in vram_info + and "vram_total" in vram_info + ): + used = float(vram_info["vram_used"]) + total = float(vram_info["vram_total"]) if total > 0: return round((used / total) * 100, 2) - + return 0.0 except Exception as e: logging.debug(f"Failed to get memory info for device {device}: {e}") @@ -122,41 +125,45 @@ def get_mem_info(self, device: int) -> float: def check_if_secondary_die(self, device: int) -> bool: """Check if GPU device is the secondary die in a MCM. - + MI200 device specific feature check. The secondary dies lack power management features. - + Args: device: The device to check. - + Returns: True if secondary die, False otherwise. 
""" try: if device >= len(self.processor_handles): return False - + processor_handle = self.processor_handles[device] - + # Check if power management is enabled - secondary dies typically don't have it - is_power_mgmt_enabled = amdsmi_interface.amdsmi_is_gpu_power_management_enabled(processor_handle) + is_power_mgmt_enabled = ( + amdsmi_interface.amdsmi_is_gpu_power_management_enabled( + processor_handle + ) + ) if not is_power_mgmt_enabled: return True - + # Alternative check: get power info and see if it's zero/unavailable try: power_info = amdsmi_interface.amdsmi_get_power_info(processor_handle) if isinstance(power_info, dict): # If both current and average power are 0, it's likely a secondary die - current_power = power_info.get('current_socket_power', -1) - avg_power = power_info.get('average_socket_power', -1) + current_power = power_info.get("current_socket_power", -1) + avg_power = power_info.get("average_socket_power", -1) if current_power == 0 and avg_power == 0: return True except Exception: # If we can't get power info, might be secondary die return True - + return False except Exception as e: logging.debug(f"Failed to check secondary die for device {device}: {e}") - return False \ No newline at end of file + return False diff --git a/src/madengine/scripts/common/tools/detect_therock.sh b/src/madengine/scripts/common/tools/detect_therock.sh index 2e04d2d1..b670cf09 100755 --- a/src/madengine/scripts/common/tools/detect_therock.sh +++ b/src/madengine/scripts/common/tools/detect_therock.sh @@ -1,7 +1,7 @@ #!/bin/sh # # Quick TheRock ROCm Detection Script -# +# # This script checks if TheRock is installed on the system. # TheRock does NOT use apt - it uses Python pip or tarballs. # @@ -26,34 +26,34 @@ echo "" check_therock_path() { path="$1" label="$2" - + if [ ! 
-d "$path" ]; then return 1 fi - + manifest="$path/share/therock/therock_manifest.json" dist_info="$path/share/therock/dist_info.json" - + if [ -f "$manifest" ]; then printf "${GREEN}✓ Found TheRock installation${NC}\n" echo " Type: $label" echo " Path: $path" - + if [ -f "$dist_info" ]; then targets=$(grep -oP '(?<="dist_amdgpu_targets": ")[^"]*' "$dist_info" 2>/dev/null || echo "unknown") echo " GPU Targets: $targets" fi - + if command -v jq > /dev/null 2>&1; then commit=$(jq -r '.the_rock_commit' "$manifest" 2>/dev/null || echo "unknown") echo " Commit: $commit" fi - + echo "" FOUND=$((FOUND + 1)) return 0 fi - + return 1 } @@ -61,11 +61,11 @@ check_therock_path() { printf "${BLUE}[1] Checking for rocm-sdk command...${NC}\n" if command -v rocm-sdk > /dev/null 2>&1; then printf "${GREEN}✓ Found rocm-sdk command${NC}\n" - + # Get version version=$(rocm-sdk version 2>/dev/null || echo "unknown") echo " Version: $version" - + # Get root path if root_path=$(rocm-sdk path --root 2>/dev/null); then echo " Root: $root_path" @@ -82,7 +82,7 @@ if python3 -c "import rocm_sdk" 2>/dev/null; then version=$(python3 -c "import rocm_sdk; print(rocm_sdk.__version__)" 2>/dev/null || echo "unknown") printf "${GREEN}✓ Found rocm_sdk Python package${NC}\n" echo " Version: $version" - + # Try to find the package path pkg_path=$(python3 -c " import importlib.util @@ -91,7 +91,7 @@ spec = importlib.util.find_spec('_rocm_sdk_core') if spec and spec.origin: print(pathlib.Path(spec.origin).parent) " 2>/dev/null || echo "") - + if [ -n "$pkg_path" ]; then check_therock_path "$pkg_path" "Python Package" fi @@ -132,7 +132,7 @@ if [ -f "version.json" ] && [ -f "CMakeLists.txt" ]; then if grep -q "rocm-version" version.json 2>/dev/null; then printf "${YELLOW}✓ Found TheRock source directory${NC}\n" echo " Path: $(pwd)" - + if [ -d "build/dist" ]; then for dist_dir in build/dist/*; do if [ -d "$dist_dir" ]; then @@ -173,4 +173,3 @@ else echo "More info: 
https://github.com/ROCm/TheRock/blob/main/RELEASES.md" exit 1 fi - diff --git a/src/madengine/scripts/common/tools/get_library_trace.py b/src/madengine/scripts/common/tools/get_library_trace.py index d011e643..f614d786 100644 --- a/src/madengine/scripts/common/tools/get_library_trace.py +++ b/src/madengine/scripts/common/tools/get_library_trace.py @@ -5,17 +5,17 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ -# built-in modules -import sys +import csv import io import os import re -from datetime import datetime -import csv import subprocess -from contextlib import redirect_stdout, redirect_stderr -import typing +# built-in modules +import sys +import typing +from contextlib import redirect_stderr, redirect_stdout +from datetime import datetime # Global variables of the trace mode mode = os.environ.get("TRACE_MODE", "").replace(" ", "").split(",") @@ -24,7 +24,7 @@ mode.append("rocblas_trace") if os.environ.get("HIPBLASLT_TRACE"): - mode.append("hipblaslt_trace") + mode.append("hipblaslt_trace") if os.environ.get("TENSILE_TRACE"): mode.append("tensile_trace") @@ -175,22 +175,20 @@ def process_miopen_trace(output_lines: list) -> bool: class LibraryFilter(object): """Class to filter the library trace information - + This class filters the library trace information based on the mode - + Args: mode: Mode of the trace liveOutput: Boolean value printConfigs: Boolean value """ + def __init__( - self, - mode: str, - liveOutput: bool=False, - printConfigs: bool=False - ) -> None: + self, mode: str, liveOutput: bool = False, printConfigs: bool = False + ) -> None: """Initialize the LibraryFilter class - + Args: mode: Mode of the trace liveOutput: Boolean value @@ -206,17 +204,14 @@ def __init__( self.printConfigs = printConfigs - def write( - self, - data: str - ) -> None: + def write(self, data: str) -> None: """Write the data - + This function writes the data - + Args: data: Data to write - + Returns: None """ @@ -230,8 +225,8 @@ def write( matched 
|= r_match if "hipblaslt_trace" in mode: - r_match = process_hipblaslt_trace(data.splitlines() ) - matched |= r_match + r_match = process_hipblaslt_trace(data.splitlines()) + matched |= r_match if "tensile_trace" in mode: t_match = process_tensile_trace(data.splitlines()) @@ -256,19 +251,17 @@ def flush(self): def run_command( - commandstring: str, - request_env: typing.Dict[str, str], - outlog: typing.Any - ): + commandstring: str, request_env: typing.Dict[str, str], outlog: typing.Any +): """Run the command - + This function runs the command - + Args: commandstring: Command string request_env: Request environment outlog: Output log - + Returns: None """ @@ -278,20 +271,20 @@ def run_command( # Run subprocess with STDOUT (not PIPE) so output goes directly to our stdout # This avoids buffering issues with nested processes process = subprocess.Popen( - commandstring, - shell=True, + commandstring, + shell=True, env=modified_env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, # Merge stderr into stdout universal_newlines=True, - bufsize=1 # Line buffered + bufsize=1, # Line buffered ) - + # Stream output line by line for line in process.stdout: outlog.write(line) outlog.flush() - + # Wait for process to complete process.wait() diff --git a/src/madengine/scripts/common/tools/gpu_info_profiler.py b/src/madengine/scripts/common/tools/gpu_info_profiler.py index 111f655d..194fb3ba 100644 --- a/src/madengine/scripts/common/tools/gpu_info_profiler.py +++ b/src/madengine/scripts/common/tools/gpu_info_profiler.py @@ -5,24 +5,25 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" +import csv +import datetime +import logging + # built-in modules import multiprocessing -import threading -import time -import datetime +import os +import signal import subprocess import sys -import csv -import os -import logging +import threading +import time import typing -import signal -from typing import Optional, List, Dict, Any +from typing import Any, Dict, List, Optional def check_amd_smi_available() -> bool: """Check if amd-smi command or Python bindings are available. - + Returns: bool: True if amd-smi is available, False otherwise. """ @@ -30,69 +31,74 @@ def check_amd_smi_available() -> bool: try: sys.path.append("/opt/rocm/libexec/amdsmi_cli/") from amdsmi_init import amdsmi_interface + logging.debug("amd-smi Python bindings found at /opt/rocm/libexec/amdsmi_cli/") return True except ImportError: logging.debug("amd-smi Python bindings not found") - + # Fallback to checking command-line tool try: result = subprocess.run( - ['amd-smi', '--version'], - capture_output=True, - text=True, - timeout=10 + ["amd-smi", "--version"], capture_output=True, text=True, timeout=10 ) if result.returncode == 0: logging.debug("amd-smi command-line tool found") return True - except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError) as e: + except ( + subprocess.TimeoutExpired, + FileNotFoundError, + subprocess.SubprocessError, + ) as e: logging.debug(f"amd-smi command not available: {e}") - + return False + def get_rocm_version() -> Optional[float]: """Get ROCm version from system. - + Returns: Optional[float]: ROCm version as major.minor (e.g., 6.1), or None if not detected. 
""" try: # Try hipconfig --version first (more reliable) result = subprocess.run( - ['hipconfig', '--version'], - capture_output=True, - text=True, - timeout=10 + ["hipconfig", "--version"], capture_output=True, text=True, timeout=10 ) if result.returncode == 0: # example output: 6.1.40092-038397aaa version_str = result.stdout.strip() - version_parts = version_str.split('.')[:2] # Get major.minor - return float('.'.join(version_parts)) + version_parts = version_str.split(".")[:2] # Get major.minor + return float(".".join(version_parts)) except (subprocess.SubprocessError, ValueError, IndexError) as e: logging.debug(f"hipconfig check failed: {e}") - + try: # Fallback to /opt/rocm/.info/version if os.path.exists("/opt/rocm/.info/version"): - result = subprocess.run(['cat', '/opt/rocm/.info/version'], - capture_output=True, text=True, timeout=10) + result = subprocess.run( + ["cat", "/opt/rocm/.info/version"], + capture_output=True, + text=True, + timeout=10, + ) if result.returncode == 0: - version_str = result.stdout.strip().split('-')[0] # Remove build suffix - version_parts = version_str.split('.')[:2] # Get major.minor - return float('.'.join(version_parts)) + version_str = result.stdout.strip().split("-")[0] # Remove build suffix + version_parts = version_str.split(".")[:2] # Get major.minor + return float(".".join(version_parts)) except (IOError, ValueError, IndexError) as e: logging.debug(f"ROCm version file check failed: {e}") - + return None + def detect_gpu_vendor() -> tuple[bool, bool]: """Detect GPU vendor (NVIDIA or AMD/ROCm). - + Returns: tuple[bool, bool]: (is_nvidia, is_rocm) - + Raises: ValueError: If no GPU management tools are found. """ @@ -116,78 +122,96 @@ def detect_gpu_vendor() -> tuple[bool, bool]: def initialize_profiler_utils(is_nvidia: bool, is_rocm: bool) -> Any: """Initialize the appropriate profiler utility based on GPU vendor. - + Args: is_nvidia: Whether NVIDIA GPU is detected. is_rocm: Whether AMD ROCm GPU is detected. 
- + Returns: Any: The ProfUtils class for the detected GPU vendor. - + Raises: ImportError: If the required profiler utility cannot be imported. """ if is_nvidia: try: from pynvml_utils import ProfUtils + return ProfUtils except ImportError as e: raise ImportError(f"Could not import pynvml_utils.py: {e}") - + # ROCm path: choose between rocm-smi and amd-smi based on version rocm_version = get_rocm_version() use_amd_smi = False - + logging.info(f"Detected ROCm version: {rocm_version}") logging.info(f"amd-smi available: {check_amd_smi_available()}") - + if rocm_version is not None and rocm_version >= 6.4: # ROCm >= 6.4: prefer amd-smi if available if check_amd_smi_available(): use_amd_smi = True logging.info(f"Using amd-smi for ROCm {rocm_version}") else: - logging.warning(f"ROCm {rocm_version} detected but amd-smi not available, using rocm-smi") + logging.warning( + f"ROCm {rocm_version} detected but amd-smi not available, using rocm-smi" + ) else: logging.info(f"ROCm {rocm_version} < 6.4, using rocm-smi") - + if use_amd_smi: try: from amd_smi_utils import ProfUtils + logging.info("Successfully imported amd_smi_utils") return ProfUtils except ImportError as import_err: # Fallback to rocm-smi if amd-smi import fails - logging.warning(f"amd-smi import failed: {import_err}, falling back to rocm-smi") + logging.warning( + f"amd-smi import failed: {import_err}, falling back to rocm-smi" + ) try: from rocm_smi_utils import ProfUtils + return ProfUtils except ImportError as e: - raise ImportError(f"Could not import amd_smi_utils.py or rocm_smi_utils.py: {e}") + raise ImportError( + f"Could not import amd_smi_utils.py or rocm_smi_utils.py: {e}" + ) except Exception as init_err: # Catch initialization errors from amd_smi_utils.__init__ - logging.warning(f"amd-smi initialization failed: {init_err}, falling back to rocm-smi") + logging.warning( + f"amd-smi initialization failed: {init_err}, falling back to rocm-smi" + ) try: from rocm_smi_utils import ProfUtils + return 
ProfUtils except ImportError as e: - raise ImportError(f"Could not import rocm_smi_utils.py after amd-smi init failed: {e}") + raise ImportError( + f"Could not import rocm_smi_utils.py after amd-smi init failed: {e}" + ) else: # ROCm < 6.4 or amd-smi not available: use rocm-smi try: from rocm_smi_utils import ProfUtils + return ProfUtils except ImportError as e: raise ImportError(f"Could not import rocm_smi_utils.py: {e}") -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) # Module-level GPU detection (performed once at import, but deferred if used as library) IS_NVIDIA: bool = False IS_ROCM: bool = False _GPU_DETECTED: bool = False + def _ensure_gpu_detected() -> None: """Ensure GPU vendor detection has been performed.""" global IS_NVIDIA, IS_ROCM, _GPU_DETECTED @@ -198,12 +222,12 @@ def _ensure_gpu_detected() -> None: def run_command(commandstring: str) -> None: """Run the command string. - + This function runs the command string. - + Args: commandstring (str): The command string to run. - + Raises: subprocess.CalledProcessError: If the command fails. """ @@ -213,10 +237,10 @@ def run_command(commandstring: str) -> None: def run_command0(commandstring: str) -> None: """Run command on GPU device 0. - + Args: commandstring: The command string to run. - + Raises: subprocess.CalledProcessError: If the command fails. """ @@ -228,10 +252,10 @@ def run_command0(commandstring: str) -> None: def run_command1(commandstring: str) -> None: """Run command on GPU device 1. - + Args: commandstring: The command string to run. - + Raises: subprocess.CalledProcessError: If the command fails. """ @@ -243,15 +267,18 @@ def run_command1(commandstring: str) -> None: class EventController(threading.Thread): """Thread to control workload execution and synchronize with profilers. - + Attributes: event: Threading event to signal profiler start/stop. 
commandstring: The command string to execute. dual_gcd: Whether to run workload on dual GCDs (AMD-specific). """ - def __init__(self, event: threading.Event, commandstring: str, dual_gcd: str, profiler: Any) -> None: + + def __init__( + self, event: threading.Event, commandstring: str, dual_gcd: str, profiler: Any + ) -> None: """Initialize the event controller. - + Args: event: Threading event for synchronization. commandstring: Command to execute. @@ -266,7 +293,7 @@ def __init__(self, event: threading.Event, commandstring: str, dual_gcd: str, pr def run(self) -> None: """Execute workload and control profiler lifecycle. - + Raises: EnvironmentError: If dual GCD mode is requested but not available. """ @@ -275,20 +302,20 @@ def run(self) -> None: time.sleep(1) # Allow profiler to initialize n_devices = len(self.profiler.list_devices()) - + # Dual GCD mode (AMD-specific) if IS_ROCM and n_devices == 2 and self.dual_gcd == "true": logging.info("Running workload on both GCDs") p0 = multiprocessing.Process(target=run_command0, args=[self.commandstring]) p1 = multiprocessing.Process(target=run_command1, args=[self.commandstring]) - + logging.info("Workload starting...") p0.start() p1.start() p0.join() p1.join() logging.info("Workload completed") - + elif IS_ROCM and n_devices != 2 and self.dual_gcd == "true": self.event.clear() raise EnvironmentError( @@ -308,16 +335,19 @@ def run(self) -> None: class ProfilerThread(threading.Thread): """Base thread class for GPU profiling. - + Attributes: data: List of profiling samples collected. devices: List of GPU device IDs to profile. sampling_rate: Time interval between samples (seconds). event: Threading event for synchronization with workload. """ - def __init__(self, devices: List[int], sampling_rate: float, event: threading.Event) -> None: + + def __init__( + self, devices: List[int], sampling_rate: float, event: threading.Event + ) -> None: """Initialize the profiler thread. 
- + Args: devices: List of GPU device IDs to profile. sampling_rate: Sampling interval in seconds. @@ -331,59 +361,68 @@ def __init__(self, devices: List[int], sampling_rate: float, event: threading.Ev def run(self, prof_fun: Any, header_string: str) -> None: """Execute profiling loop. - + Args: prof_fun: Function to call for getting metric value for a device. header_string: Column header prefix for CSV output. """ self.event.wait() # Wait for workload to start logging.info("Profiler started") - + while self.event.is_set(): now = datetime.datetime.now() row: Dict[str, Any] = {"time": now.strftime("%Y-%m-%d %H:%M:%S.%f")} - + for device_id in self.devices: current_val = prof_fun(device_id) row[f"{header_string}{device_id}"] = current_val - + logging.debug(f"Sample: {row}") self.data.append(row) time.sleep(self.sampling_rate) - + logging.info(f"Profiler stopped. Collected {len(self.data)} samples") class PowerProfiler(ProfilerThread): """Thread for profiling GPU power consumption. - + Attributes: prof_fun: Function to get power metric. header_string: CSV column header prefix. """ - def __init__(self, devices: List[int], sampling_rate: float, event: threading.Event, - profiler: Any, device_filter: str) -> None: + + def __init__( + self, + devices: List[int], + sampling_rate: float, + event: threading.Event, + profiler: Any, + device_filter: str, + ) -> None: """Initialize the power profiler. - + Args: devices: List of GPU device IDs to profile. sampling_rate: Sampling interval in seconds. event: Threading event for synchronization. profiler: GPU profiler utility instance. device_filter: Device filter string ("all" or specific device). - + Raises: ValueError: If a specified device is a secondary die (AMD-specific). 
""" super().__init__(devices, sampling_rate, event) - + # AMD-specific: Filter out secondary dies if IS_ROCM and device_filter != "all": for device_id in self.devices: if profiler.check_if_secondary_die(device_id): raise ValueError(f"Device {device_id} is a secondary die") elif IS_ROCM and device_filter == "all": - self.devices = [d for d in self.devices if not profiler.check_if_secondary_die(d)] + self.devices = [ + d for d in self.devices if not profiler.check_if_secondary_die(d) + ] self.prof_fun = profiler.get_power self.header_string = "Power(Watt) GPU" @@ -395,15 +434,21 @@ def run(self) -> None: class VRAMProfiler(ProfilerThread): """Thread for profiling GPU VRAM/memory usage. - + Attributes: prof_fun: Function to get memory metric. header_string: CSV column header prefix. """ - def __init__(self, devices: List[int], sampling_rate: float, event: threading.Event, - profiler: Any) -> None: + + def __init__( + self, + devices: List[int], + sampling_rate: float, + event: threading.Event, + profiler: Any, + ) -> None: """Initialize the VRAM profiler. - + Args: devices: List of GPU device IDs to profile. sampling_rate: Sampling interval in seconds. @@ -421,13 +466,13 @@ def run(self) -> None: def main() -> None: """Profile GPU usage during workload execution. - + Reads configuration from environment variables: MODE: "power" or "vram" DEVICE: Comma-separated device IDs or "all" SAMPLING_RATE: Sampling interval in seconds DUAL_GCD: "true" to enable dual GCD mode (AMD-specific) - + Raises: ValueError: If MODE is invalid or required env vars are missing. EnvironmentError: If dual GCD mode is incompatible with system. 
@@ -441,13 +486,13 @@ def main() -> None: commandstring += f'"{arg}" ' else: commandstring += f"{arg} " - + # Get required environment variables mode = os.environ.get("MODE") device = os.environ.get("DEVICE") sampling_rate_str = os.environ.get("SAMPLING_RATE") dual_gcd = os.environ.get("DUAL_GCD", "false") - + # Validate environment variables if not mode: raise ValueError("MODE environment variable is required") @@ -455,15 +500,15 @@ def main() -> None: raise ValueError("DEVICE environment variable is required") if not sampling_rate_str: raise ValueError("SAMPLING_RATE environment variable is required") - + try: sampling_rate = float(sampling_rate_str) except ValueError: raise ValueError(f"Invalid SAMPLING_RATE: {sampling_rate_str}") - + if mode not in ["power", "vram"]: raise ValueError(f"Invalid MODE: {mode}. Must be 'power' or 'vram'") - + # Initialize profiler utility prof_utils_class = initialize_profiler_utils(IS_NVIDIA, IS_ROCM) try: @@ -475,62 +520,64 @@ def main() -> None: logging.warning("Attempting fallback to rocm-smi") try: from rocm_smi_utils import ProfUtils as RocmSmiProfUtils + profiler = RocmSmiProfUtils(mode) logging.info("Successfully fell back to rocm-smi") except Exception as fallback_err: - raise RuntimeError(f"Failed to initialize both amd-smi and rocm-smi: {e}, {fallback_err}") + raise RuntimeError( + f"Failed to initialize both amd-smi and rocm-smi: {e}, {fallback_err}" + ) else: raise - + # Create synchronization event event = threading.Event() # Parse device list device_list = device.split(",") - + if len(device_list) == 1 and device_list[0] == "all": device_list = profiler.list_devices() elif len(device_list) == 1 and device_list[0].isdigit(): device_list = [int(device_list[0])] else: device_list = [int(d) for d in device_list] - - logging.info(f"Profiling mode: {mode}, devices: {device_list}, sampling rate: {sampling_rate}s") + + logging.info( + f"Profiling mode: {mode}, devices: {device_list}, sampling rate: {sampling_rate}s" + ) # 
Create threads workload_thread = EventController( - event=event, - commandstring=commandstring, - dual_gcd=dual_gcd, - profiler=profiler + event=event, commandstring=commandstring, dual_gcd=dual_gcd, profiler=profiler ) - + if mode == "power": profiler_thread = PowerProfiler( devices=device_list, sampling_rate=sampling_rate, event=event, profiler=profiler, - device_filter=device + device_filter=device, ) else: # mode == "vram" profiler_thread = VRAMProfiler( devices=device_list, sampling_rate=sampling_rate, event=event, - profiler=profiler + profiler=profiler, ) # Global flag for signal handling shutdown_requested = threading.Event() - + def signal_handler(signum, frame): """Handle SIGTERM/SIGINT to gracefully shutdown.""" logging.info(f"Received signal {signum}, initiating graceful shutdown...") shutdown_requested.set() # Stop the profiler event to signal threads to stop event.clear() - + # Register signal handlers signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) @@ -538,23 +585,23 @@ def signal_handler(signum, frame): # Execute profiling workload_thread.start() profiler_thread.start() - + # Wait for either workload completion or shutdown signal workload_thread.join(timeout=1) while workload_thread.is_alive() and not shutdown_requested.is_set(): time.sleep(0.1) - + # If shutdown was requested, clear event to stop profiler if shutdown_requested.is_set(): event.clear() logging.info("Shutdown requested, stopping profiler thread...") - + # Wait for profiler thread to finish profiler_thread.join(timeout=5) - + # Write results to CSV output_file = os.environ.get("OUTPUT_FILE", "prof.csv") - + if not profiler_thread.data: logging.warning("No profiling data collected") # Don't exit with error if we got a shutdown signal - this is expected @@ -562,15 +609,17 @@ def signal_handler(signum, frame): sys.exit(1) else: try: - with open(output_file, "w", newline='') as csvfile: - writer = csv.DictWriter(csvfile, 
fieldnames=profiler_thread.data[0].keys()) + with open(output_file, "w", newline="") as csvfile: + writer = csv.DictWriter( + csvfile, fieldnames=profiler_thread.data[0].keys() + ) writer.writeheader() writer.writerows(profiler_thread.data) logging.info(f"Profiling data written to {output_file}") except IOError as e: logging.error(f"Failed to write output file: {e}") sys.exit(1) - + if __name__ == "__main__": main() diff --git a/src/madengine/scripts/common/tools/pynvml_utils.py b/src/madengine/scripts/common/tools/pynvml_utils.py index e1915895..8c81d941 100644 --- a/src/madengine/scripts/common/tools/pynvml_utils.py +++ b/src/madengine/scripts/common/tools/pynvml_utils.py @@ -6,10 +6,11 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ +import logging + # built-in modules import typing -import logging -from typing import Optional, List +from typing import List, Optional # third-party modules import pynvml @@ -17,7 +18,7 @@ class ProfUtils: """Class to get GPU information using NVIDIA pynvml library. - + Attributes: device_count: Number of NVIDIA GPUs detected. handles: List of NVML device handles. @@ -26,10 +27,10 @@ class ProfUtils: def __init__(self, mode: str) -> None: """Initialize the NVIDIA profiler utility. - + Args: mode: Mode parameter for API compatibility (not used for NVIDIA). - + Raises: RuntimeError: If NVML initialization fails. """ @@ -37,15 +38,15 @@ def __init__(self, mode: str) -> None: pynvml.nvmlInit() except pynvml.NVMLError as e: raise RuntimeError(f"Failed to initialize NVML: {e}") - + try: self.device_count = pynvml.nvmlDeviceGetCount() except pynvml.NVMLError as e: raise RuntimeError(f"Failed to get device count: {e}") - + self.handles: List = [] self.device_list: List[int] = [] - + for i in range(self.device_count): try: self.device_list.append(i) @@ -55,17 +56,17 @@ def __init__(self, mode: str) -> None: def get_power(self, device: int) -> str: """Get current power consumption of a GPU device. 
- + Args: device: GPU device index. - + Returns: Power consumption in watts as string, or 'N/A' if unavailable. """ if device < 0 or device >= len(self.handles): logging.error(f"Invalid device index: {device}") - return 'N/A' - + return "N/A" + try: # nvmlDeviceGetPowerUsage returns milliwatts power_mw = pynvml.nvmlDeviceGetPowerUsage(self.handles[device]) @@ -73,11 +74,11 @@ def get_power(self, device: int) -> str: return str(round(power_watts, 2)) except pynvml.NVMLError as e: logging.debug(f"Failed to get power for device {device}: {e}") - return 'N/A' + return "N/A" def list_devices(self) -> List[int]: """Get list of available GPU device indices. - + Returns: List of device indices. """ @@ -85,17 +86,17 @@ def list_devices(self) -> List[int]: def get_mem_info(self, device: int) -> float: """Get memory usage percentage for a GPU device. - + Args: device: GPU device index. - + Returns: Memory usage percentage as float (0-100). """ if device < 0 or device >= len(self.handles): logging.error(f"Invalid device index: {device}") return 0.0 - + try: info = pynvml.nvmlDeviceGetMemoryInfo(self.handles[device]) if info.total > 0: @@ -108,13 +109,13 @@ def get_mem_info(self, device: int) -> float: def check_if_secondary_die(self, device: int) -> bool: """Check if device is a secondary die. - + This method is provided for API compatibility with AMD utils. NVIDIA GPUs do not have the concept of secondary dies like AMD MCM GPUs. - + Args: device: GPU device index. - + Returns: Always False for NVIDIA GPUs. """ diff --git a/src/madengine/scripts/common/tools/rocm_smi_utils.py b/src/madengine/scripts/common/tools/rocm_smi_utils.py index dd73219b..fdbf9190 100644 --- a/src/madengine/scripts/common/tools/rocm_smi_utils.py +++ b/src/madengine/scripts/common/tools/rocm_smi_utils.py @@ -6,8 +6,8 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" -import sys import logging +import sys from typing import List sys.path.append("/opt/rocm/libexec/rocm_smi/") @@ -20,11 +20,12 @@ class ProfUtils: """Class to get GPU information using AMD rocm_smi utility. - + Attributes: rocm6: Whether ROCm 6+ API is available. rocmsmi: ROCm SMI bindings instance. """ + def __init__(self, mode) -> None: self.rocm6 = False try: @@ -32,10 +33,14 @@ def __init__(self, mode) -> None: if rocm_smi.driverInitialized() is True: ret_init = self.rocmsmi.rsmi_init(0) if ret_init != 0: - raise ValueError('ROCm SMI returned %s (the expected value is 0)', ret_init) + raise ValueError( + "ROCm SMI returned %s (the expected value is 0)", ret_init + ) exit(ret_init) else: - raise ImportError('Driver not initialized (amdgpu not found in modules)') + raise ImportError( + "Driver not initialized (amdgpu not found in modules)" + ) exit(0) self.rocm6 = True except Exception: @@ -43,32 +48,32 @@ def __init__(self, mode) -> None: def get_power(self, device: int) -> str: """Get current socket power of a given device. - + Args: device: DRM device identifier. - + Returns: Power consumption in watts as string, or 'N/A' if unavailable. """ if self.rocm6: power = c_uint32() ret = self.rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power)) - if rocm_smi.rsmi_ret_ok(ret, device, 'get_socket_power', False): + if rocm_smi.rsmi_ret_ok(ret, device, "get_socket_power", False): return str(power.value / 1000000) - return 'N/A' + return "N/A" else: return rocm_smi.getPower(device) def list_devices(self) -> List[int]: """Get list of GPU device indices. - + Returns: List of device indices. 
""" if self.rocm6: numberOfDevices = c_uint32(0) ret = self.rocmsmi.rsmi_num_monitor_devices(byref(numberOfDevices)) - if rocm_smi.rsmi_ret_ok(ret, metric='get_num_monitbyrefor_devices'): + if rocm_smi.rsmi_ret_ok(ret, metric="get_num_monitbyrefor_devices"): deviceList = list(range(numberOfDevices.value)) return deviceList else: @@ -78,31 +83,33 @@ def list_devices(self) -> List[int]: def get_mem_info(self, device: int) -> float: """Get memory usage percentage for a device. - + Args: device: GPU device index. - + Returns: Memory usage percentage as float. """ if self.rocm6: memoryUse = c_uint64() - ret = self.rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse)) - if rocm_smi.rsmi_ret_ok(ret, device, '% memory use'): + ret = self.rocmsmi.rsmi_dev_memory_busy_percent_get( + device, byref(memoryUse) + ) + if rocm_smi.rsmi_ret_ok(ret, device, "% memory use"): return memoryUse.value else: (memUsed, memTotal) = rocm_smi.getMemInfo(device, "vram") - return round(float(memUsed)/float(memTotal) * 100, 2) + return round(float(memUsed) / float(memTotal) * 100, 2) def check_if_secondary_die(self, device: int) -> bool: """Check if GCD(die) is the secondary die in a MCM. - + MI200 device specific feature check. The secondary dies lack power management features. - + Args: device: The device to check. - + Returns: True if secondary die, False otherwise. 
""" @@ -112,8 +119,14 @@ def check_if_secondary_die(self, device: int) -> bool: timestamp = c_uint64() # secondary die can be determined by checking if energy counter == 0 - ret = self.rocmsmi.rsmi_dev_energy_count_get(device, byref(energy_count), byref(counter_resoution), byref(timestamp)) - if (rocm_smi.rsmi_ret_ok(ret, None, 'energy_count_secondary_die_check', silent=False)) and (energy_count.value == 0): + ret = self.rocmsmi.rsmi_dev_energy_count_get( + device, byref(energy_count), byref(counter_resoution), byref(timestamp) + ) + if ( + rocm_smi.rsmi_ret_ok( + ret, None, "energy_count_secondary_die_check", silent=False + ) + ) and (energy_count.value == 0): return True return False else: diff --git a/src/madengine/scripts/common/tools/rocprof_counter_csv_to_instruction_histogram.py b/src/madengine/scripts/common/tools/rocprof_counter_csv_to_instruction_histogram.py index 386bf08f..27c44f2b 100644 --- a/src/madengine/scripts/common/tools/rocprof_counter_csv_to_instruction_histogram.py +++ b/src/madengine/scripts/common/tools/rocprof_counter_csv_to_instruction_histogram.py @@ -66,13 +66,19 @@ def parse_csv(path: Path) -> dict: def main() -> int: if len(sys.argv) < 2: - print("Usage: rocprof_counter_csv_to_instruction_histogram.py ", file=sys.stderr) + print( + "Usage: rocprof_counter_csv_to_instruction_histogram.py ", + file=sys.stderr, + ) return 1 out_dir = Path(sys.argv[1]) if not out_dir.is_dir(): return 0 # no dir, skip silently aggregated = {} - for name in ("rocprofv3_output_counter_collection.csv", "rocprofv3_output_domain_stats.csv"): + for name in ( + "rocprofv3_output_counter_collection.csv", + "rocprofv3_output_domain_stats.csv", + ): path = out_dir / name if not path.exists(): continue diff --git a/src/madengine/scripts/common/tools/rocprof_wrapper.sh b/src/madengine/scripts/common/tools/rocprof_wrapper.sh index 995e5c79..755b87e0 100755 --- a/src/madengine/scripts/common/tools/rocprof_wrapper.sh +++ 
b/src/madengine/scripts/common/tools/rocprof_wrapper.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# # ROCm Profiler Wrapper - Intelligently select between rocprof (legacy) and rocprofv3 (new) # # This wrapper handles the transition from rocprof to rocprofv3 across ROCm versions. @@ -39,17 +39,17 @@ get_rocm_version() { # Try multiple methods to detect ROCm version local version="" - + # Method 1: Check rocm-smi output if command -v rocm-smi &> /dev/null; then version=$(rocm-smi --version 2>/dev/null | grep -oP 'ROCm version: \K[0-9]+\.[0-9]+\.[0-9]+' | head -1) fi - + # Method 2: Check /opt/rocm/.info/version file if [ -z "$version" ] && [ -f /opt/rocm/.info/version ]; then version=$(cat /opt/rocm/.info/version) fi - + # Method 3: Check ROCM_PATH or default ROCm installation if [ -z "$version" ]; then local rocm_path="${ROCM_PATH:-/opt/rocm}" @@ -57,7 +57,7 @@ get_rocm_version() { version=$(cat "$rocm_path/.info/version") fi fi - + echo "$version" } @@ -72,13 +72,13 @@ version_gte() { # Function to detect available profiler detect_profiler() { local rocm_version=$(get_rocm_version) - + # Check if rocprofv3 is available if command -v rocprofv3 &> /dev/null; then echo "rocprofv3" return 0 fi - + # Check if rocprof (legacy) is available if command -v rocprof &> /dev/null; then # For ROCm >= 7.0, warn that rocprofv3 should be available @@ -88,7 +88,7 @@ detect_profiler() { echo "rocprof" return 0 fi - + # No profiler found echo "Error: Neither rocprofv3 nor rocprof found in PATH" >&2 echo "Please ensure ROCm profiler tools are installed" >&2 @@ -99,11 +99,11 @@ detect_profiler() { main() { local profiler=$(detect_profiler) local exit_code=$? 
- + if [ $exit_code -ne 0 ]; then return 1 fi - + # Execute the detected profiler with all passed arguments if [ "$profiler" = "rocprof" ]; then # Legacy rocprof syntax: rocprof [options] [args] @@ -117,21 +117,21 @@ main() { local profiler_opts=() local app_cmd=() local found_separator=false - + for arg in "$@"; do if [ "$arg" = "--" ]; then # Found the separator, everything after this is the application command found_separator=true continue fi - + if [ "$found_separator" = true ]; then app_cmd+=("$arg") else profiler_opts+=("$arg") fi done - + # Build command with proper argument placement. # Filter known-noisy rocprofv3/generateRocpd stderr: "sql text value for value is empty. Using NULL instead" # (ROCm writes this for every empty string->NULL in the SQLite DB; harmless but floods logs.) @@ -148,4 +148,3 @@ main() { # Run main function main "$@" - diff --git a/src/madengine/scripts/common/tools/therock_detector.py b/src/madengine/scripts/common/tools/therock_detector.py index 557ba55d..97dbf169 100755 --- a/src/madengine/scripts/common/tools/therock_detector.py +++ b/src/madengine/scripts/common/tools/therock_detector.py @@ -41,6 +41,7 @@ def _prepend_madengine_to_sys_path() -> None: therock_manifest_path, ) except ImportError: # pragma: no cover — script copied outside a package tree + def therock_manifest_path(path: Path) -> Path: # keep in sync with therock_markers return path / "share" / "therock" / "therock_manifest.json" @@ -88,7 +89,7 @@ def _add_installation(self, install_type: str, path: Path, details: Dict): "path": str(path.resolve()), "details": details, } - + # Avoid duplicates if not any(inst["path"] == installation["path"] for inst in self.installations): self.installations.append(installation) @@ -97,7 +98,7 @@ def _add_installation(self, install_type: str, path: Path, details: Dict): def _is_therock_installation(self, path: Path) -> Optional[Dict]: """ Check if a path contains TheRock installation markers. 
- + Returns dict with installation details if TheRock is detected, None otherwise. """ if not path.exists(): @@ -127,7 +128,9 @@ def _is_therock_installation(self, path: Path) -> Optional[Dict]: with open(dist_info_path, "r") as f: dist_info = json.load(f) details["dist_info"] = { - "amdgpu_targets": dist_info.get("dist_amdgpu_targets", "unknown"), + "amdgpu_targets": dist_info.get( + "dist_amdgpu_targets", "unknown" + ), } except Exception as e: self.log(f"Error reading dist_info: {e}") @@ -153,20 +156,20 @@ def _is_therock_installation(self, path: Path) -> Optional[Dict]: # If we found any TheRock markers, return details if details: return details - + return None def _detect_rocm_sdk_command(self): """Detect rocm-sdk command in PATH (indicates pip installation).""" self.log("Checking for rocm-sdk command...") - + rocm_sdk_path = shutil.which("rocm-sdk") if rocm_sdk_path: self.log(f"Found rocm-sdk at: {rocm_sdk_path}") - + # Try to get installation details details = {"command_path": rocm_sdk_path} - + # Get version try: result = subprocess.run( @@ -201,25 +204,26 @@ def _detect_rocm_sdk_command(self): def _detect_python_packages(self): """Detect TheRock Python packages in site-packages.""" self.log("Checking Python site-packages...") - + try: import site import importlib.util - + # Check for rocm_sdk package spec = importlib.util.find_spec("rocm_sdk") if spec and spec.origin: package_path = Path(spec.origin).parent self.log(f"Found rocm_sdk package at: {package_path}") - + # Try to import and get details try: import rocm_sdk + details = { "package_path": str(package_path), "version": getattr(rocm_sdk, "__version__", "unknown"), } - + # Try to get rocm_sdk_core path for TheRock markers core_spec = importlib.util.find_spec("_rocm_sdk_core") if core_spec and core_spec.origin: @@ -230,14 +234,14 @@ def _detect_python_packages(self): self._add_installation("python_package", core_path, details) except Exception as e: self.log(f"Error importing rocm_sdk: {e}") - + except 
Exception as e: self.log(f"Error checking Python packages: {e}") def _detect_tarball_installations(self): """Detect tarball installations in common paths.""" self.log("Checking common installation paths...") - + # Common installation directories for tarballs common_paths = [ Path.home() / "rocm", @@ -246,7 +250,7 @@ def _detect_tarball_installations(self): Path("/usr/local/rocm"), Path.home() / ".local" / "rocm", ] - + for path in common_paths: if path.exists(): details = self._is_therock_installation(path) @@ -256,13 +260,13 @@ def _detect_tarball_installations(self): def _detect_from_env_vars(self): """Detect TheRock from environment variables.""" self.log("Checking environment variables...") - + env_vars = [ "ROCM_PATH", "ROCM_HOME", "HIP_PATH", ] - + for var in env_vars: value = os.environ.get(var) if value: @@ -277,18 +281,20 @@ def _detect_from_env_vars(self): def _detect_build_directories(self): """Detect local TheRock build directories.""" self.log("Checking for local build directories...") - + # Check current directory and parent directories current = Path.cwd() for _ in range(5): # Check up to 5 levels up # Check for TheRock source indicators - if (current / "CMakeLists.txt").exists() and (current / "version.json").exists(): + if (current / "CMakeLists.txt").exists() and ( + current / "version.json" + ).exists(): try: with open(current / "version.json", "r") as f: version_data = json.load(f) if "rocm-version" in version_data: self.log(f"Found TheRock source at: {current}") - + # Check build directory build_dir = current / "build" if build_dir.exists(): @@ -296,14 +302,20 @@ def _detect_build_directories(self): if dist_dir.exists(): for dist_subdir in dist_dir.iterdir(): if dist_subdir.is_dir(): - details = self._is_therock_installation(dist_subdir) + details = self._is_therock_installation( + dist_subdir + ) if details: details["source_path"] = str(current) - details["rocm_version"] = version_data.get("rocm-version") - 
self._add_installation("local_build", dist_subdir, details) + details["rocm_version"] = ( + version_data.get("rocm-version") + ) + self._add_installation( + "local_build", dist_subdir, details + ) except Exception as e: self.log(f"Error checking build directory: {e}") - + parent = current.parent if parent == current: break @@ -315,33 +327,33 @@ def format_installation_info(installation: Dict) -> str: lines = [] lines.append(f"\nType: {installation['type']}") lines.append(f"Path: {installation['path']}") - + details = installation.get("details", {}) - + if "version" in details: lines.append(f"Version: {details['version']}") - + if "rocm_version" in details: lines.append(f"ROCm Version: {details['rocm_version']}") - + if "manifest" in details: manifest = details["manifest"] lines.append(f"TheRock Commit: {manifest.get('commit', 'unknown')}") lines.append(f"Submodules: {manifest.get('submodules_count', 0)}") - + if "dist_info" in details: dist_info = details["dist_info"] lines.append(f"GPU Targets: {dist_info.get('amdgpu_targets', 'unknown')}") - + if "binaries" in details: lines.append(f"Compilers: {', '.join(details['binaries'])}") - + if "command_path" in details: lines.append(f"Command: {details['command_path']}") - + if "detected_via" in details: lines.append(f"Detected via: ${details['detected_via']}") - + return "\n".join(lines) @@ -358,7 +370,8 @@ def main(): """, ) parser.add_argument( - "-v", "--verbose", + "-v", + "--verbose", action="store_true", help="Enable verbose output", ) @@ -372,11 +385,11 @@ def main(): type=Path, help="Check specific path for TheRock installation", ) - + args = parser.parse_args() - + detector = TherockDetector(verbose=args.verbose) - + # If specific path provided, check only that if args.path: details = detector._is_therock_installation(args.path) @@ -393,14 +406,14 @@ def main(): else: # Run full detection installations = detector.detect_all() - + # Output results if not installations: print("No TheRock installations detected.") 
print("\nTheRock uses Python pip packages or tarballs, not apt.") print("See: https://github.com/ROCm/TheRock/blob/main/RELEASES.md") sys.exit(1) - + if args.json: print(json.dumps(installations, indent=2)) else: @@ -408,9 +421,9 @@ def main(): for i, installation in enumerate(installations, 1): print(f"\n{'=' * 60}") print(f"Installation #{i}") - print('=' * 60) + print("=" * 60) print(format_installation_info(installation)) - + print(f"\n{'=' * 60}") print("\nTheRock Installation Info:") print("- TheRock does NOT use apt/system packages") @@ -418,10 +431,9 @@ def main(): print("- Python packages install to venv site-packages") print("- Tarballs extract to custom directories") print("\nFor more info: https://github.com/ROCm/TheRock") - + sys.exit(0) if __name__ == "__main__": main() - diff --git a/src/madengine/scripts/k8s/data/download_aws.sh b/src/madengine/scripts/k8s/data/download_aws.sh index 79a705ff..0017ae38 100755 --- a/src/madengine/scripts/k8s/data/download_aws.sh +++ b/src/madengine/scripts/k8s/data/download_aws.sh @@ -60,4 +60,3 @@ echo "MAD_DATA_DOWNLOAD_DURATION=$DURATION" >> /tmp/mad_metrics.env echo "MAD_DATA_SIZE=$SIZE" >> /tmp/mad_metrics.env echo "MAD_DATA_PROVIDER_TYPE=aws" >> /tmp/mad_metrics.env echo "MAD_DATANAME=$DATANAME" >> /tmp/mad_metrics.env - diff --git a/src/madengine/scripts/k8s/data/download_local.sh b/src/madengine/scripts/k8s/data/download_local.sh index 901af88c..27ad405c 100755 --- a/src/madengine/scripts/k8s/data/download_local.sh +++ b/src/madengine/scripts/k8s/data/download_local.sh @@ -41,4 +41,3 @@ echo "MAD_DATA_DOWNLOAD_DURATION=0" >> /tmp/mad_metrics.env echo "MAD_DATA_SIZE=$SIZE" >> /tmp/mad_metrics.env echo "MAD_DATA_PROVIDER_TYPE=local" >> /tmp/mad_metrics.env echo "MAD_DATANAME=$DATANAME" >> /tmp/mad_metrics.env - diff --git a/src/madengine/scripts/k8s/data/download_minio.sh b/src/madengine/scripts/k8s/data/download_minio.sh index f0da3932..03d66d82 100755 --- a/src/madengine/scripts/k8s/data/download_minio.sh +++ 
b/src/madengine/scripts/k8s/data/download_minio.sh @@ -79,4 +79,3 @@ echo "MAD_DATA_DOWNLOAD_DURATION=$DURATION" >> /tmp/mad_metrics.env echo "MAD_DATA_SIZE=$SIZE" >> /tmp/mad_metrics.env echo "MAD_DATA_PROVIDER_TYPE=minio" >> /tmp/mad_metrics.env echo "MAD_DATANAME=$DATANAME" >> /tmp/mad_metrics.env - diff --git a/src/madengine/scripts/k8s/data/download_nas.sh b/src/madengine/scripts/k8s/data/download_nas.sh index 45e062d8..5d3e23d8 100755 --- a/src/madengine/scripts/k8s/data/download_nas.sh +++ b/src/madengine/scripts/k8s/data/download_nas.sh @@ -24,14 +24,14 @@ NAS_PASS=${NAS_PASSWORD} # If credentials not in environment, try to read from credential.json if [ -z "$NAS_PASS" ] && [ -f "/workspace/credential.json" ]; then echo "Reading NAS credentials from credential.json..." - + # Extract NAS node info (try first node or find by hostname) NAS_HOST=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); nodes=d.get('NAS_NODES', []); print(nodes[0].get('HOST', 'mlse-nas.amd.com') if nodes else 'mlse-nas.amd.com')" 2>/dev/null || echo "mlse-nas.amd.com") - + NAS_PORT=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); nodes=d.get('NAS_NODES', []); print(nodes[0].get('PORT', '22') if nodes else '22')" 2>/dev/null || echo "22") - + NAS_USER=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); nodes=d.get('NAS_NODES', []); print(nodes[0].get('USERNAME', 'datum') if nodes else 'datum')" 2>/dev/null || echo "datum") - + NAS_PASS=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); nodes=d.get('NAS_NODES', []); print(nodes[0].get('PASSWORD', '') if nodes else '')" 2>/dev/null || echo "") fi @@ -86,4 +86,3 @@ echo "MAD_DATA_DOWNLOAD_DURATION=$DURATION" >> /tmp/mad_metrics.env echo "MAD_DATA_SIZE=$SIZE" >> /tmp/mad_metrics.env echo "MAD_DATA_PROVIDER_TYPE=nas" >> /tmp/mad_metrics.env echo "MAD_DATANAME=$DATANAME" >> /tmp/mad_metrics.env - diff --git 
a/src/madengine/scripts/k8s/tools.json b/src/madengine/scripts/k8s/tools.json index c7a3398e..874183f9 100644 --- a/src/madengine/scripts/k8s/tools.json +++ b/src/madengine/scripts/k8s/tools.json @@ -1,7 +1,7 @@ { "_comment": "madengine K8s Tools Configuration", "_description": "Configuration for K8s-specific tools and data providers", - + "data_providers": { "minio": { "script": "scripts/k8s/data/download_minio.sh", @@ -31,7 +31,7 @@ "env_vars": {} } }, - + "wrappers": { "gpu_profiler": { "script": "scripts/k8s/wrappers/run_profiler.sh", @@ -48,7 +48,7 @@ "env_vars": {} } }, - + "shared_tools": { "_note": "These tools from scripts/common/ work directly in K8s without wrappers", "tools": [ @@ -78,7 +78,7 @@ } ] }, - + "pre_scripts": [ { "name": "gpu_info_pre", @@ -87,7 +87,7 @@ "description": "Pre-execution GPU status check" } ], - + "post_scripts": [ { "name": "gpu_info_post", @@ -97,4 +97,3 @@ } ] } - diff --git a/src/madengine/scripts/k8s/wrappers/run_profiler.sh b/src/madengine/scripts/k8s/wrappers/run_profiler.sh index 17bd125c..392e85ac 100755 --- a/src/madengine/scripts/k8s/wrappers/run_profiler.sh +++ b/src/madengine/scripts/k8s/wrappers/run_profiler.sh @@ -47,4 +47,3 @@ if [ -d "$OUTPUT_DIR" ]; then echo "Output files:" ls -lh $OUTPUT_DIR fi - diff --git a/src/madengine/scripts/k8s/wrappers/run_rocenv.sh b/src/madengine/scripts/k8s/wrappers/run_rocenv.sh index c26ad9d5..d9324f66 100755 --- a/src/madengine/scripts/k8s/wrappers/run_rocenv.sh +++ b/src/madengine/scripts/k8s/wrappers/run_rocenv.sh @@ -57,4 +57,3 @@ echo "Results saved to: /workspace/$OUTPUT_NAME.csv" if [ -f "/workspace/$OUT_CSV" ]; then echo "CSV file size: $(du -h /workspace/$OUT_CSV | cut -f1)" fi - diff --git a/src/madengine/scripts/slurm/epilog.sh b/src/madengine/scripts/slurm/epilog.sh index 6f7b68e2..ab707264 100644 --- a/src/madengine/scripts/slurm/epilog.sh +++ b/src/madengine/scripts/slurm/epilog.sh @@ -1,7 +1,7 @@ #!/bin/bash # # SLURM Epilog Script for GPU Cleanup -# +# # This 
script should be installed on SLURM compute nodes to ensure # GPU processes are properly cleaned up after each job. # @@ -27,14 +27,14 @@ log_message "=== Epilog script starting ===" # Function to kill GPU processes cleanup_gpu_processes() { log_message "Checking for GPU processes..." - + # Try AMD GPUs first if [ -x /opt/rocm/bin/amd-smi ]; then log_message "Detected AMD ROCm installation, checking for processes..." - + # Get PIDs using amd-smi PIDS=$(amd-smi process 2>/dev/null | grep -v PID | awk '{print $1}' | grep -E '^[0-9]+$' | sort -u) - + if [ ! -z "$PIDS" ]; then log_message "Found GPU processes to clean: $PIDS" for pid in $PIDS; do @@ -47,7 +47,7 @@ cleanup_gpu_processes() { else log_message "No GPU processes found via amd-smi" fi - + # Try fuser on GPU devices as backup for device in /dev/kfd /dev/dri/renderD*; do if [ -e "$device" ]; then @@ -64,13 +64,13 @@ cleanup_gpu_processes() { fi done fi - + # Try NVIDIA GPUs if [ -x /usr/bin/nvidia-smi ]; then log_message "Detected NVIDIA GPU installation, checking for processes..." - + PIDS=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+$') - + if [ ! -z "$PIDS" ]; then log_message "Found NVIDIA GPU processes to clean: $PIDS" for pid in $PIDS; do @@ -89,7 +89,7 @@ cleanup_gpu_processes() { # Function to kill Ray processes cleanup_ray_processes() { log_message "Cleaning up Ray processes..." - + # Kill Ray worker processes RAY_PIDS=$(pgrep -f "ray::" 2>/dev/null || true) if [ ! -z "$RAY_PIDS" ]; then @@ -99,7 +99,7 @@ cleanup_ray_processes() { else log_message "No Ray processes found" fi - + # Kill vLLM worker processes VLLM_PIDS=$(pgrep -f "RayWorkerWrapper" 2>/dev/null || true) if [ ! -z "$VLLM_PIDS" ]; then @@ -109,7 +109,7 @@ cleanup_ray_processes() { else log_message "No vLLM worker processes found" fi - + # Kill any vllm processes VLLM_MAIN_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) if [ ! 
-z "$VLLM_MAIN_PIDS" ]; then @@ -123,7 +123,7 @@ cleanup_ray_processes() { cleanup_docker_containers() { if command -v docker &> /dev/null; then log_message "Checking for stale Docker containers..." - + # Find containers that might be from madengine CONTAINERS=$(docker ps -q --filter "name=container_rocm" 2>/dev/null || true) if [ ! -z "$CONTAINERS" ]; then @@ -142,13 +142,13 @@ cleanup_docker_containers() { # Function to reset GPU state reset_gpu_state() { log_message "Resetting GPU state..." - + # AMD GPU reset if [ -x /opt/rocm/bin/rocm-smi ]; then log_message "Resetting AMD GPUs..." /opt/rocm/bin/rocm-smi --gpureset 2>/dev/null || log_message "GPU reset failed (may require reboot)" fi - + # NVIDIA GPU reset (requires nvidia-smi) if [ -x /usr/bin/nvidia-smi ]; then log_message "Resetting NVIDIA GPUs..." @@ -175,4 +175,3 @@ cleanup_gpu_processes log_message "=== Epilog script completed ===" exit 0 - diff --git a/src/madengine/utils/__init__.py b/src/madengine/utils/__init__.py index 8281537a..948d9796 100644 --- a/src/madengine/utils/__init__.py +++ b/src/madengine/utils/__init__.py @@ -4,8 +4,12 @@ Utility modules for madengine including GPU configuration resolution and config parsing. """ -from .gpu_config import GPUConfigResolver, resolve_runtime_gpus from .config_parser import ConfigParser, get_config_parser +from .gpu_config import GPUConfigResolver, resolve_runtime_gpus -__all__ = ["GPUConfigResolver", "resolve_runtime_gpus", "ConfigParser", "get_config_parser"] - +__all__ = [ + "GPUConfigResolver", + "resolve_runtime_gpus", + "ConfigParser", + "get_config_parser", +] diff --git a/src/madengine/utils/config_parser.py b/src/madengine/utils/config_parser.py index 04e71f9c..585b013a 100644 --- a/src/madengine/utils/config_parser.py +++ b/src/madengine/utils/config_parser.py @@ -10,10 +10,10 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" -import os -import re import json import logging +import os +import re import typing from pathlib import Path @@ -21,6 +21,7 @@ try: import yaml + YAML_AVAILABLE = True except ImportError: YAML_AVAILABLE = False @@ -30,111 +31,109 @@ class ConfigParser: """Parser for model configuration files. - + This class handles parsing configuration files in various formats (CSV, JSON, YAML) that are referenced in model arguments. - + Supports three usage patterns when run from MAD-internal CI: 1. MAD-internal models: ./scripts/model/configs/ 2. MAD submodule: ./scripts/MAD/model/configs/ 3. MAD-private submodule: ./scripts/MAD-private/model/configs/ - + Also works when run standalone in MAD or MAD-private repos. """ - + # Known repository/submodule names to detect - KNOWN_REPOS = ['MAD', 'MAD-private', 'MAD-internal'] - + KNOWN_REPOS = ["MAD", "MAD-private", "MAD-internal"] + def __init__(self, scripts_base_dir: typing.Optional[str] = None): """Initialize ConfigParser. - + Args: - scripts_base_dir: Base directory for scripts + scripts_base_dir: Base directory for scripts (e.g., "scripts/MAD-private/pyt_atom") """ self.scripts_base_dir = scripts_base_dir self._path_cache = {} # Cache resolved paths - + def _extract_repo_root(self, path: str) -> typing.Optional[str]: """Extract repository root from a scripts path. 
- + Examples: "scripts/MAD-private/pyt_atom" -> "scripts/MAD-private" "scripts/MAD/vllm" -> "scripts/MAD" "scripts/model" -> "scripts" - + Args: path: Full or partial path containing scripts directory - + Returns: Repository root path, or None if not identifiable """ if not path: return None - + parts = Path(path).parts - + # Find 'scripts' in the path try: - scripts_idx = parts.index('scripts') + scripts_idx = parts.index("scripts") except ValueError: return None - + # Check if next part after 'scripts' is a known repo name if scripts_idx + 1 < len(parts): next_part = parts[scripts_idx + 1] if next_part in self.KNOWN_REPOS: # It's a submodule: scripts/MAD-private or scripts/MAD - return os.path.join(*parts[:scripts_idx + 2]) + return os.path.join(*parts[: scripts_idx + 2]) else: # It's MAD-internal's own models: scripts/model -> scripts - return os.path.join(*parts[:scripts_idx + 1]) - + return os.path.join(*parts[: scripts_idx + 1]) + # Just 'scripts' directory - return os.path.join(*parts[:scripts_idx + 1]) - + return os.path.join(*parts[: scripts_idx + 1]) + def _build_candidate_paths( - self, - config_path: str, - model_scripts_path: str = None + self, config_path: str, model_scripts_path: str = None ) -> typing.List[str]: """Build list of candidate paths to try in priority order. 
- + Args: config_path: Relative config path (e.g., "configs/default.csv") model_scripts_path: Path to model script file - + Returns: List of full paths to try, in order of priority """ candidates = [] - + # Priority 1: Relative to model's immediate directory # scripts/MAD-private/pyt_atom + configs/default.csv if model_scripts_path: scripts_dir = os.path.dirname(model_scripts_path) if scripts_dir: candidates.append(os.path.join(scripts_dir, config_path)) - + # Priority 2: Relative to scripts_base_dir # scripts/MAD-private/pyt_atom + configs/default.csv if self.scripts_base_dir: candidates.append(os.path.join(self.scripts_base_dir, config_path)) - + # Priority 3: Relative to repository root (for shared configs) # This handles: scripts/MAD-private/pyt_atom -> scripts/MAD-private/configs/ if self.scripts_base_dir: repo_root = self._extract_repo_root(self.scripts_base_dir) if repo_root: candidates.append(os.path.join(repo_root, config_path)) - + if model_scripts_path: scripts_dir = os.path.dirname(model_scripts_path) if scripts_dir: repo_root = self._extract_repo_root(scripts_dir) if repo_root: candidates.append(os.path.join(repo_root, config_path)) - + # Priority 4: Walk up from model's directory # Try parent directories up to repo root if model_scripts_path: @@ -144,7 +143,7 @@ def _build_candidate_paths( candidates.extend( self._walk_up_between(config_path, scripts_dir, repo_root) ) - + # Priority 5: Walk up from scripts_base_dir if self.scripts_base_dir: repo_root = self._extract_repo_root(self.scripts_base_dir) @@ -152,7 +151,7 @@ def _build_candidate_paths( candidates.extend( self._walk_up_between(config_path, self.scripts_base_dir, repo_root) ) - + # Remove duplicates while preserving order seen = set() unique_candidates = [] @@ -161,29 +160,26 @@ def _build_candidate_paths( if normalized not in seen: seen.add(normalized) unique_candidates.append(normalized) - + return unique_candidates - + def _walk_up_between( - self, - config_path: str, - start_dir: str, - 
stop_dir: str + self, config_path: str, start_dir: str, stop_dir: str ) -> typing.List[str]: """Generate candidate paths by walking up from start to stop directory. - + Args: config_path: Relative config path start_dir: Starting directory stop_dir: Stop at this directory (inclusive) - + Returns: List of candidate paths """ candidates = [] current = os.path.abspath(start_dir) stop = os.path.abspath(stop_dir) - + while current == stop or current.startswith(stop + os.sep): parent = os.path.dirname(current) if parent == current: # Reached root @@ -192,46 +188,44 @@ def _walk_up_between( candidates.append(os.path.join(current, config_path)) if current == stop: # Reached stop directory break - + return candidates - + def parse_config_from_args( - self, - args_string: str, - model_scripts_path: str = None + self, args_string: str, model_scripts_path: str = None ) -> typing.Optional[str]: """Extract and resolve config file path from model arguments. - + Resolution strategy: 1. If absolute path -> verify it exists 2. Try model's immediate directory 3. Try scripts_base_dir 4. Try repository root (scripts/MAD-private/, scripts/MAD/, scripts/) 5. 
Walk up from model directory to repo root - + This handles all cases: - MAD-internal models: scripts/model/configs/default.csv - MAD submodule: scripts/MAD/model/configs/default.csv - MAD-private submodule: scripts/MAD-private/model/configs/default.csv - Shared configs at repo level: scripts/MAD-private/configs/default.csv - + Args: args_string: The args field from models.json model_scripts_path: Path to the model's script file (e.g., run.py) - + Returns: Full path to config file, or None if not found """ if not args_string: return None - + # Look for --config argument - config_match = re.search(r'--config\s+([^\s]+)', args_string) + config_match = re.search(r"--config\s+([^\s]+)", args_string) if not config_match: return None - + config_path = config_match.group(1) - + # Check cache first cache_key = f"{config_path}::{model_scripts_path}::{self.scripts_base_dir}" if cache_key in self._path_cache: @@ -240,7 +234,7 @@ def parse_config_from_args( return cached_path else: del self._path_cache[cache_key] - + # Handle absolute paths if os.path.isabs(config_path): if os.path.exists(config_path): @@ -249,17 +243,17 @@ def parse_config_from_args( else: LOGGER.warning(f"Absolute config path does not exist: {config_path}") return None - + # Build and try candidate paths candidates = self._build_candidate_paths(config_path, model_scripts_path) - + for candidate in candidates: LOGGER.debug(f"Trying config path: {candidate}") if os.path.exists(candidate): LOGGER.info(f"Found config file at: {candidate}") self._path_cache[cache_key] = candidate return candidate - + # Not found LOGGER.warning( f"Config file not found: {config_path}\n" @@ -270,16 +264,15 @@ def parse_config_from_args( + (f"\n ... and {len(candidates)-5} more" if len(candidates) > 5 else "") ) return None - + def load_config_file( - self, - config_path: str + self, config_path: str ) -> typing.Optional[typing.Union[typing.List[dict], dict]]: """Load and parse a configuration file. 
- + Args: config_path: Full path to the config file - + Returns: For CSV: List of dicts (one per row, excluding empty rows) For JSON/YAML: Dict or list as-is from file @@ -287,15 +280,15 @@ def load_config_file( """ if not config_path or not os.path.exists(config_path): return None - + file_ext = Path(config_path).suffix.lower() - + try: - if file_ext == '.csv': + if file_ext == ".csv": return self._load_csv(config_path) - elif file_ext == '.json': + elif file_ext == ".json": return self._load_json(config_path) - elif file_ext in ['.yaml', '.yml']: + elif file_ext in [".yaml", ".yml"]: return self._load_yaml(config_path) else: LOGGER.warning(f"Unsupported config file format: {file_ext}") @@ -303,133 +296,137 @@ def load_config_file( except Exception as e: LOGGER.error(f"Error loading config file {config_path}: {e}") return None - + def _load_csv(self, config_path: str) -> typing.List[dict]: """Load CSV config file. - + Args: config_path: Path to CSV file - + Returns: List of dicts, one per row (excluding completely empty rows) """ df = pd.read_csv(config_path) - + # Remove rows that are completely empty (all NaN) # This handles blank lines in CSV files - df = df.dropna(how='all') - + df = df.dropna(how="all") + # Convert NaN to None for JSON serialization df = df.where(pd.notnull(df), None) - + # Convert to list of dicts - configs = df.to_dict(orient='records') - + configs = df.to_dict(orient="records") + LOGGER.info(f"Loaded {len(configs)} config entries from {config_path}") - + return configs - + def _load_json(self, config_path: str) -> typing.Union[dict, list]: """Load JSON config file. - + Args: config_path: Path to JSON file - + Returns: Dict or list from JSON file """ - with open(config_path, 'r') as f: + with open(config_path, "r") as f: return json.load(f) - + def _load_yaml(self, config_path: str) -> typing.Union[dict, list]: """Load YAML config file. 
- + Args: config_path: Path to YAML file - + Returns: Dict or list from YAML file """ if not YAML_AVAILABLE: raise ImportError("PyYAML is not installed. Cannot load YAML config files.") - - with open(config_path, 'r') as f: + + with open(config_path, "r") as f: return yaml.safe_load(f) - + def match_config_to_result( - self, - configs_list: typing.List[dict], - result_data: dict, - model_name: str + self, configs_list: typing.List[dict], result_data: dict, model_name: str ) -> typing.Optional[dict]: """Match a specific result to its corresponding config. - + For CSV configs with multiple rows (like vllm), match based on model name and other identifiable fields. - + Args: configs_list: List of config dicts (from CSV rows) result_data: Single result row data model_name: The model name from result - + Returns: Matching config dict, or None if no match found """ if not configs_list: return None - + # For single config, return it if len(configs_list) == 1: return configs_list[0] - + # For multiple configs, try to match based on common fields for config in configs_list: # Try to match on 'model' field if it exists in both - if 'model' in config and 'model' in result_data: + if "model" in config and "model" in result_data: # Compare normalized versions - config_model = str(config['model']).replace('/', '_').replace('-', '_').lower() - result_model = str(result_data['model']).replace('/', '_').replace('-', '_').lower() + config_model = ( + str(config["model"]).replace("/", "_").replace("-", "_").lower() + ) + result_model = ( + str(result_data["model"]) + .replace("/", "_") + .replace("-", "_") + .lower() + ) if config_model in result_model or result_model in config_model: # Additional checks for benchmark type if available - if 'benchmark' in config and 'benchmark' in result_data: - if config['benchmark'] == result_data['benchmark']: + if "benchmark" in config and "benchmark" in result_data: + if config["benchmark"] == result_data["benchmark"]: return config else: return 
config - + # If no match found, return first config as fallback - LOGGER.warning(f"Could not match config for result: {model_name}. Using first config.") + LOGGER.warning( + f"Could not match config for result: {model_name}. Using first config." + ) return configs_list[0] - + def parse_and_load( - self, - args_string: str, - model_scripts_path: str = None + self, args_string: str, model_scripts_path: str = None ) -> typing.Optional[typing.Union[typing.List[dict], dict]]: """Parse config path from args and load the config file. - + Convenience method that combines parse_config_from_args and load_config_file. - + Args: args_string: The args field from models.json model_scripts_path: Path to the model's script file - + Returns: Config data (list of dicts for CSV, dict for JSON/YAML), or None """ config_path = self.parse_config_from_args(args_string, model_scripts_path) if not config_path: return None - + return self.load_config_file(config_path) def get_config_parser(scripts_base_dir: typing.Optional[str] = None) -> ConfigParser: """Factory function to create a ConfigParser instance. - + Args: scripts_base_dir: Base directory for scripts - + Returns: ConfigParser instance """ diff --git a/src/madengine/utils/discover_models.py b/src/madengine/utils/discover_models.py index fe795e7b..6ae52b32 100644 --- a/src/madengine/utils/discover_models.py +++ b/src/madengine/utils/discover_models.py @@ -5,11 +5,12 @@ # built-in modules import argparse -import os -import json import importlib.util +import json +import os import typing -from dataclasses import dataclass, field, asdict +from dataclasses import asdict, dataclass, field + from rich.console import Console as RichConsole @@ -73,51 +74,57 @@ def _setup_model_dir_if_needed(self) -> None: This copies docker/, scripts/, and config files (models.json, credential.json, data.json) from MODEL_DIR to the current working directory to support the model discovery process. 
This operation is safe for build-only (CPU) nodes as it only involves file operations. - + MODEL_DIR defaults to "." (current directory) if not set. Only copies if MODEL_DIR points to a different directory than current working directory. """ model_dir_env = os.environ.get("MODEL_DIR", ".") - + # Get absolute paths to compare model_dir_abs = os.path.abspath(model_dir_env) cwd_abs = os.path.abspath(".") - + # Only copy if MODEL_DIR points to a different directory (not current dir) if model_dir_abs != cwd_abs: import shlex import subprocess from pathlib import Path - self.rich_console.print(f"[bold cyan]📁 MODEL_DIR environment variable detected:[/bold cyan] [yellow]{model_dir_env}[/yellow]") + self.rich_console.print( + f"[bold cyan]📁 MODEL_DIR environment variable detected:[/bold cyan] [yellow]{model_dir_env}[/yellow]" + ) print(f"Copying required files to current working directory: {cwd_abs}") try: # Check if source directory exists if not os.path.exists(model_dir_env): - self.rich_console.print(f"[yellow]⚠️ Warning: MODEL_DIR path does not exist: {model_dir_env}[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ Warning: MODEL_DIR path does not exist: {model_dir_env}[/yellow]" + ) return # Copy specific directories and files only (not everything with /*) # This prevents copying unwanted subdirectories from MODEL_DIR items_to_copy = [] - + # Directories to copy for subdir in ["docker", "scripts"]: src_path = Path(model_dir_env) / subdir if src_path.exists(): items_to_copy.append((src_path, subdir, "directory")) - + # Files to copy for file in ["models.json", "credential.json", "data.json"]: src_file = Path(model_dir_env) / file if src_file.exists(): items_to_copy.append((src_file, file, "file")) - + if not items_to_copy: - self.rich_console.print(f"[yellow]⚠️ No required files/directories found in MODEL_DIR[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ No required files/directories found in MODEL_DIR[/yellow]" + ) return - + # Copy each item copied_count = 
0 for src_path, item_name, item_type in items_to_copy: @@ -127,7 +134,7 @@ def _setup_model_dir_if_needed(self) -> None: cmd, shell=True, capture_output=True, text=True, check=True ) copied_count += 1 - + if result.stdout: # Show summary for directories, full output for files if item_type == "directory": @@ -135,21 +142,29 @@ def _setup_model_dir_if_needed(self) -> None: if len(lines) < 10: print(result.stdout) else: - print(f" ✓ Copied {item_name}/ ({len(lines)} files)") + print( + f" ✓ Copied {item_name}/ ({len(lines)} files)" + ) else: print(f" ✓ Copied {item_name}") except subprocess.CalledProcessError as e: - self.rich_console.print(f"[yellow]⚠️ Warning: Failed to copy {item_name}: {e}[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ Warning: Failed to copy {item_name}: {e}[/yellow]" + ) if e.stderr: print(f" Error details: {e.stderr}") # Continue with other items even if one fails - + if copied_count > 0: - self.rich_console.print(f"[green]✅ Successfully copied {copied_count} item(s) from MODEL_DIR[/green]") - + self.rich_console.print( + f"[green]✅ Successfully copied {copied_count} item(s) from MODEL_DIR[/green]" + ) + print(f"Model dir: {model_dir_env} → current dir: {cwd_abs}") except Exception as e: - self.rich_console.print(f"[yellow]⚠️ Warning: Unexpected error copying MODEL_DIR: {e}[/yellow]") + self.rich_console.print( + f"[yellow]⚠️ Warning: Unexpected error copying MODEL_DIR: {e}[/yellow]" + ) # Continue execution even if copy fails def discover_models(self) -> None: @@ -179,7 +194,9 @@ def discover_models(self) -> None: files = os.listdir(root) if "models.json" in files and "get_models_json.py" in files: - self.rich_console.print(f"[red]❌ Both models.json and get_models_json.py found in {root}.[/red]") + self.rich_console.print( + f"[red]❌ Both models.json and get_models_json.py found in {root}.[/red]" + ) raise ValueError( f"Both models.json and get_models_json.py found in {root}." 
) @@ -311,7 +328,9 @@ def select_models(self) -> None: custom_model.update_model() dirname = custom_model.name.split("/")[0] custom_model.dockerfile = os.path.normpath( - os.path.join("scripts", dirname, custom_model.dockerfile) + os.path.join( + "scripts", dirname, custom_model.dockerfile + ) ) custom_model.scripts = os.path.normpath( os.path.join("scripts", dirname, custom_model.scripts) @@ -339,7 +358,9 @@ def select_models(self) -> None: custom_model.update_model() dirname = custom_model.name.split("/")[0] custom_model.dockerfile = os.path.normpath( - os.path.join("scripts", dirname, custom_model.dockerfile) + os.path.join( + "scripts", dirname, custom_model.dockerfile + ) ) custom_model.scripts = os.path.normpath( os.path.join("scripts", dirname, custom_model.scripts) @@ -349,7 +370,9 @@ def select_models(self) -> None: tag_models.append(model_dict) if not tag_models: - self.rich_console.print(f"[red]❌ No models found corresponding to the given tag: {tag}[/red]") + self.rich_console.print( + f"[red]❌ No models found corresponding to the given tag: {tag}[/red]" + ) raise ValueError( f"No models found corresponding to the given tag: {tag}" ) @@ -359,11 +382,15 @@ def select_models(self) -> None: def print_models(self) -> None: if self.selected_models: # print selected models using parsed tags and adding backslash-separated extra args - self.rich_console.print(f"[bold green]📋 Selected Models ({len(self.selected_models)} models):[/bold green]") + self.rich_console.print( + f"[bold green]📋 Selected Models ({len(self.selected_models)} models):[/bold green]" + ) print(json.dumps(self.selected_models, indent=4)) else: # print list of all model names - self.rich_console.print(f"[bold cyan]📊 Available Models ({len(self.model_list)} total):[/bold cyan]") + self.rich_console.print( + f"[bold cyan]📊 Available Models ({len(self.model_list)} total):[/bold cyan]" + ) for model_name in self.model_list: print(f" {model_name}") diff --git a/src/madengine/utils/gpu_config.py 
b/src/madengine/utils/gpu_config.py index 4b3c4143..50029b64 100644 --- a/src/madengine/utils/gpu_config.py +++ b/src/madengine/utils/gpu_config.py @@ -16,7 +16,7 @@ import logging import warnings -from typing import Dict, Any, Optional, Tuple +from typing import Any, Dict, Optional, Tuple logger = logging.getLogger(__name__) @@ -24,20 +24,20 @@ class GPUConfigResolver: """ Resolves GPU count from multiple configuration sources with clear precedence. - + Handles various field names (n_gpus, gpu_count, gpus_per_node) and provides validation to catch configuration mismatches early. """ - + # All recognized field names for GPU count GPU_FIELD_ALIASES = [ "gpus_per_node", # SLURM, preferred standard - "gpu_count", # Kubernetes - "n_gpus", # Legacy model.json - "num_gpus", # Alternative - "ngpus", # Alternative + "gpu_count", # Kubernetes + "n_gpus", # Legacy model.json + "num_gpus", # Alternative + "ngpus", # Alternative ] - + @classmethod def resolve_gpu_count( cls, @@ -48,16 +48,16 @@ def resolve_gpu_count( ) -> Tuple[int, str]: """ Resolve GPU count from multiple sources with clear precedence. - + Args: model_info: Model configuration from models.json deployment_config: Deployment configuration (slurm/k8s section) runtime_override: Runtime override from --additional-context validate: Whether to validate and warn about mismatches - + Returns: Tuple of (gpu_count, source) where source indicates which config was used - + Examples: >>> # Priority 1: Runtime override >>> count, source = GPUConfigResolver.resolve_gpu_count( @@ -67,7 +67,7 @@ def resolve_gpu_count( ... ) >>> count, source (4, 'runtime_override') - + >>> # Priority 2: Deployment config >>> count, source = GPUConfigResolver.resolve_gpu_count( ... model_info={"n_gpus": "1"}, @@ -75,7 +75,7 @@ def resolve_gpu_count( ... ) >>> count, source (8, 'deployment_config.slurm.gpus_per_node') - + >>> # Priority 3: Model definition >>> count, source = GPUConfigResolver.resolve_gpu_count( ... 
model_info={"n_gpus": "2"} @@ -84,7 +84,7 @@ def resolve_gpu_count( (2, 'model_info.n_gpus') """ sources = [] # Track all sources for validation - + # Priority 1: Runtime override if runtime_override: gpu_count = cls._extract_gpu_count(runtime_override, "runtime_override") @@ -93,31 +93,36 @@ def resolve_gpu_count( if validate: cls._validate_consistency(sources, model_info, deployment_config) return gpu_count, "runtime_override" - + # Priority 2: Deployment-specific config if deployment_config: # Check for SLURM config if "slurm" in deployment_config: gpu_count = cls._extract_gpu_count( - deployment_config["slurm"], - "deployment_config.slurm" + deployment_config["slurm"], "deployment_config.slurm" ) if gpu_count is not None: sources.append(("deployment_config.slurm.gpus_per_node", gpu_count)) if validate: - cls._validate_consistency(sources, model_info, deployment_config) + cls._validate_consistency( + sources, model_info, deployment_config + ) return gpu_count, "deployment_config.slurm.gpus_per_node" - + # Check for K8s config if "k8s" in deployment_config or "kubernetes" in deployment_config: - k8s_config = deployment_config.get("k8s") or deployment_config.get("kubernetes") + k8s_config = deployment_config.get("k8s") or deployment_config.get( + "kubernetes" + ) gpu_count = cls._extract_gpu_count(k8s_config, "deployment_config.k8s") if gpu_count is not None: sources.append(("deployment_config.k8s.gpu_count", gpu_count)) if validate: - cls._validate_consistency(sources, model_info, deployment_config) + cls._validate_consistency( + sources, model_info, deployment_config + ) return gpu_count, "deployment_config.k8s.gpu_count" - + # Priority 3: Model definition if model_info: gpu_count = cls._extract_gpu_count(model_info, "model_info") @@ -126,55 +131,55 @@ def resolve_gpu_count( if validate: cls._validate_consistency(sources, model_info, deployment_config) return gpu_count, "model_info.n_gpus" - + # Priority 4: Default return 1, "default" - + @classmethod - def 
_extract_gpu_count( - cls, - config: Dict[str, Any], - context: str - ) -> Optional[int]: + def _extract_gpu_count(cls, config: Dict[str, Any], context: str) -> Optional[int]: """ Extract GPU count from config dict, trying all known field names. - + Args: config: Configuration dictionary context: Context string for warning messages - + Returns: GPU count as integer, or None if not found """ if not config: return None - + found_fields = [] for field_name in cls.GPU_FIELD_ALIASES: if field_name in config: found_fields.append((field_name, config[field_name])) - + if not found_fields: return None - + # Warn if multiple GPU fields found if len(found_fields) > 1: field_list = ", ".join([f"{name}={val}" for name, val in found_fields]) logger.warning( "Multiple GPU fields in %s: %s. Using %s=%s", - context, field_list, found_fields[0][0], found_fields[0][1], + context, + field_list, + found_fields[0][0], + found_fields[0][1], ) - + # Convert to int (handle string values like "8") try: return int(found_fields[0][1]) except (ValueError, TypeError): logger.warning( "Invalid GPU count in %s: %s. Using default.", - context, found_fields[0][1], + context, + found_fields[0][1], ) return None - + @classmethod def _validate_consistency( cls, @@ -184,9 +189,9 @@ def _validate_consistency( ) -> None: """ Validate consistency between different GPU count sources. - + Warns if there are mismatches that might indicate configuration errors. 
- + Args: sources: List of (source_name, gpu_count) tuples found so far model_info: Model configuration for additional checks @@ -194,50 +199,51 @@ def _validate_consistency( """ if not sources: return - + # Collect all GPU counts from all sources all_counts = {} - + # Add already resolved source for source_name, count in sources: all_counts[source_name] = count - + # Check model_info if model_info: model_gpu = cls._extract_gpu_count(model_info, "model_info") if model_gpu is not None: all_counts["model_info.n_gpus"] = model_gpu - + # Check deployment config if deployment_config: if "slurm" in deployment_config: - slurm_gpu = cls._extract_gpu_count( - deployment_config["slurm"], "slurm" - ) + slurm_gpu = cls._extract_gpu_count(deployment_config["slurm"], "slurm") if slurm_gpu is not None: all_counts["deployment_config.slurm.gpus_per_node"] = slurm_gpu - + if "k8s" in deployment_config or "kubernetes" in deployment_config: - k8s_config = deployment_config.get("k8s") or deployment_config.get("kubernetes") + k8s_config = deployment_config.get("k8s") or deployment_config.get( + "kubernetes" + ) k8s_gpu = cls._extract_gpu_count(k8s_config, "k8s") if k8s_gpu is not None: all_counts["deployment_config.k8s.gpu_count"] = k8s_gpu - + # Check for mismatches unique_counts = set(all_counts.values()) if len(unique_counts) > 1: mismatch_details = ", ".join([f"{k}={v}" for k, v in all_counts.items()]) # Determine if this is likely intentional (deployment override) or an error - is_deployment_override = ( - sources[0][0].startswith("runtime_override") or - sources[0][0].startswith("deployment_config") - ) - + is_deployment_override = sources[0][0].startswith( + "runtime_override" + ) or sources[0][0].startswith("deployment_config") + if is_deployment_override: # This is normal - deployment config overriding model default logger.info( "GPU configuration override: %s=%s (overriding model default: %s)", - sources[0][0], sources[0][1], mismatch_details.split(",")[-1].strip(), + 
sources[0][0], + sources[0][1], + mismatch_details.split(",")[-1].strip(), ) else: # Potentially unexpected mismatch - use warning for actual errors @@ -246,7 +252,7 @@ def _validate_consistency( f" Using: {sources[0][0]}={sources[0][1]}\n" f" Precedence: runtime_override > deployment_config > model_info > default", UserWarning, - stacklevel=4 + stacklevel=4, ) @@ -256,16 +262,16 @@ def resolve_runtime_gpus( ) -> int: """ Convenience function for resolving GPU count at runtime. - + This is the main entry point for runtime GPU resolution. - + Args: model_info: Model configuration from manifest additional_context: Additional context from CLI or config files - + Returns: Resolved GPU count as integer - + Example: >>> model_info = {"name": "my_model", "n_gpus": "1"} >>> additional_context = {"slurm": {"gpus_per_node": 8}} @@ -275,18 +281,20 @@ def resolve_runtime_gpus( """ # Extract deployment config from additional_context deployment_config = additional_context.get("deployment_config", {}) - + # Also check for direct slurm/k8s keys in additional_context if "slurm" in additional_context: if not deployment_config: deployment_config = {} deployment_config["slurm"] = additional_context["slurm"] - + if "k8s" in additional_context or "kubernetes" in additional_context: if not deployment_config: deployment_config = {} - deployment_config["k8s"] = additional_context.get("k8s") or additional_context.get("kubernetes") - + deployment_config["k8s"] = additional_context.get( + "k8s" + ) or additional_context.get("kubernetes") + # Check for direct runtime GPU override (in additional_context or deployment_config) runtime_override = None for field in GPUConfigResolver.GPU_FIELD_ALIASES: @@ -297,15 +305,14 @@ def resolve_runtime_gpus( if deployment_config and field in deployment_config: runtime_override = {field: deployment_config[field]} break - + gpu_count, source = GPUConfigResolver.resolve_gpu_count( model_info=model_info, deployment_config=deployment_config, 
runtime_override=runtime_override, validate=True, ) - + logger.info("Resolved GPU count: %s (from %s)", gpu_count, source) - - return gpu_count + return gpu_count diff --git a/src/madengine/utils/gpu_tool_factory.py b/src/madengine/utils/gpu_tool_factory.py index b3a0b566..300a3cbb 100644 --- a/src/madengine/utils/gpu_tool_factory.py +++ b/src/madengine/utils/gpu_tool_factory.py @@ -36,18 +36,18 @@ def get_gpu_tool_manager( Returns: GPU tool manager instance for the specified vendor - + Raises: ValueError: If vendor is unknown or unsupported ImportError: If vendor-specific manager module cannot be imported - + Example: >>> from madengine.utils.gpu_tool_factory import get_gpu_tool_manager >>> from madengine.utils.gpu_validator import GPUVendor - >>> + >>> >>> # Auto-detect vendor >>> manager = get_gpu_tool_manager() - >>> + >>> >>> # Explicit vendor >>> amd_manager = get_gpu_tool_manager(GPUVendor.AMD) >>> nvidia_manager = get_gpu_tool_manager(GPUVendor.NVIDIA) @@ -69,6 +69,7 @@ def get_gpu_tool_manager( if vendor == GPUVendor.AMD: try: from madengine.utils.rocm_tool_manager import ROCmToolManager + manager = ROCmToolManager(rocm_path=rocm_path) logger.info(f"Created new ROCm tool manager") except ImportError as e: @@ -77,21 +78,22 @@ def get_gpu_tool_manager( elif vendor == GPUVendor.NVIDIA: try: from madengine.utils.nvidia_tool_manager import NvidiaToolManager + manager = NvidiaToolManager() logger.info(f"Created new NVIDIA tool manager") except ImportError as e: raise ImportError(f"Failed to import NVIDIA tool manager: {e}") - + elif vendor == GPUVendor.UNKNOWN: raise ValueError( "Unable to detect GPU vendor. 
Ensure GPU drivers and tools are installed.\n" "For AMD: Install ROCm (https://github.com/ROCm/ROCm)\n" "For NVIDIA: Install CUDA toolkit" ) - + else: raise ValueError(f"Unsupported GPU vendor: {vendor.value}") - + # Cache the manager instance _manager_instances[cache_key] = manager @@ -100,18 +102,18 @@ def get_gpu_tool_manager( def clear_manager_cache() -> None: """Clear all cached manager instances. - + Useful for testing or when GPU configuration changes during runtime. This will force recreation of managers on next call to get_gpu_tool_manager(). - + Also clears internal caches within each manager before removing them. """ global _manager_instances - + # Clear caches within managers before removing them for manager in _manager_instances.values(): manager.clear_cache() - + _manager_instances.clear() logger.debug("Cleared all GPU tool manager instances") @@ -126,4 +128,3 @@ def get_cached_managers() -> Dict[tuple, BaseGPUToolManager]: Dictionary mapping (vendor, rocm_path) to manager instances """ return _manager_instances.copy() - diff --git a/src/madengine/utils/gpu_tool_manager.py b/src/madengine/utils/gpu_tool_manager.py index 701e1db7..1eff8205 100644 --- a/src/madengine/utils/gpu_tool_manager.py +++ b/src/madengine/utils/gpu_tool_manager.py @@ -20,175 +20,164 @@ class BaseGPUToolManager(ABC): """Abstract base class for GPU vendor-specific tool managers. - + Provides common infrastructure for: - Tool availability checking - Command execution with timeout - Result caching - Consistent logging - + Subclasses implement vendor-specific logic for: - Version detection - Tool selection - Command execution with fallback """ - + def __init__(self): """Initialize base GPU tool manager.""" self._cache: Dict[str, Any] = {} self._cache_lock = threading.Lock() - + @abstractmethod def get_version(self) -> Optional[str]: """Get GPU vendor tool version (e.g., ROCm version, CUDA version). 
- + Returns: Version string or None if unable to detect """ pass - + @abstractmethod def execute_command( - self, - command: str, - fallback_command: Optional[str] = None, - timeout: int = 30 + self, command: str, fallback_command: Optional[str] = None, timeout: int = 30 ) -> str: """Execute command with optional fallback. - + Args: command: Primary command to execute fallback_command: Optional fallback command if primary fails timeout: Command timeout in seconds - + Returns: Command output as string - + Raises: RuntimeError: If both primary and fallback commands fail """ pass - + def is_tool_available(self, tool_path: str) -> bool: """Check if a tool exists and is executable. - + Args: tool_path: Path to the tool (e.g., /opt/rocm/bin/amd-smi) - + Returns: True if tool exists and is executable, False otherwise """ cache_key = f"tool_available:{tool_path}" - + # Check cache first with self._cache_lock: if cache_key in self._cache: return self._cache[cache_key] - + # Check if file exists and is executable result = os.path.isfile(tool_path) and os.access(tool_path, os.X_OK) - + # Cache the result with self._cache_lock: self._cache[cache_key] = result - + return result - + def _execute_shell_command( - self, - command: str, - timeout: int = 30, - check_returncode: bool = True + self, command: str, timeout: int = 30, check_returncode: bool = True ) -> Tuple[bool, str, str]: """Execute a shell command and return result. 
- + Args: command: Shell command to execute timeout: Timeout in seconds check_returncode: If True, only succeed on returncode 0 - + Returns: Tuple of (success, stdout, stderr) """ try: result = subprocess.run( - command, - shell=True, - capture_output=True, - text=True, - timeout=timeout + command, shell=True, capture_output=True, text=True, timeout=timeout ) - + success = (result.returncode == 0) if check_returncode else True return success, result.stdout.strip(), result.stderr.strip() - + except subprocess.TimeoutExpired: return False, "", f"Command timed out after {timeout} seconds" except FileNotFoundError: return False, "", f"Command not found: {command.split()[0]}" except Exception as e: return False, "", f"Command execution error: {str(e)}" - + def _cache_result(self, key: str, value: Any) -> None: """Cache a result for future use. - + Args: key: Cache key value: Value to cache """ with self._cache_lock: self._cache[key] = value - + def _get_cached_result(self, key: str) -> Optional[Any]: """Get a cached result. - + Args: key: Cache key - + Returns: Cached value or None if not found """ with self._cache_lock: return self._cache.get(key) - + def _log_debug(self, message: str) -> None: """Log a debug message. - + Args: message: Debug message """ logger.debug(f"[{self.__class__.__name__}] {message}") - + def _log_info(self, message: str) -> None: """Log an info message. - + Args: message: Info message """ logger.info(f"[{self.__class__.__name__}] {message}") - + def _log_warning(self, message: str) -> None: """Log a warning message. - + Args: message: Warning message """ logger.warning(f"[{self.__class__.__name__}] {message}") - + def _log_error(self, message: str) -> None: """Log an error message. - + Args: message: Error message """ logger.error(f"[{self.__class__.__name__}] {message}") - + def clear_cache(self) -> None: """Clear all cached results. - + Useful for testing or when tools are installed/updated during runtime. 
""" with self._cache_lock: self._cache.clear() self._log_debug("Cache cleared") - diff --git a/src/madengine/utils/gpu_validator.py b/src/madengine/utils/gpu_validator.py index 8429891e..d68fdc39 100644 --- a/src/madengine/utils/gpu_validator.py +++ b/src/madengine/utils/gpu_validator.py @@ -8,17 +8,18 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ -import subprocess import os -from typing import List, Tuple, Optional +import subprocess from dataclasses import dataclass from enum import Enum +from typing import List, Optional, Tuple from madengine.core.constants import get_rocm_path class GPUVendor(Enum): """Supported GPU vendors""" + AMD = "AMD" NVIDIA = "NVIDIA" UNKNOWN = "UNKNOWN" @@ -27,13 +28,14 @@ class GPUVendor(Enum): @dataclass class GPUValidationResult: """Result of GPU validation check""" + is_valid: bool vendor: GPUVendor version: Optional[str] = None # ROCm version or CUDA version issues: List[str] = None warnings: List[str] = None suggestions: List[str] = None - + def __post_init__(self): if self.issues is None: self.issues = [] @@ -48,8 +50,8 @@ class ROCmValidator: # KFD (Kernel Fusion Driver) paths - not under ROCm install KFD_PATHS = { - 'kfd_device': '/dev/kfd', - 'kfd_topology': '/sys/devices/virtual/kfd/kfd/topology/nodes', + "kfd_device": "/dev/kfd", + "kfd_topology": "/sys/devices/virtual/kfd/kfd/topology/nodes", } def __init__(self, verbose: bool = False, rocm_path: Optional[str] = None): @@ -62,32 +64,29 @@ def __init__(self, verbose: bool = False, rocm_path: Optional[str] = None): self.verbose = verbose self.rocm_path = get_rocm_path(rocm_path) self.ESSENTIAL_PATHS = { - 'rocm_root': self.rocm_path, - 'hip_path': os.path.join(self.rocm_path, 'bin', 'hipconfig'), - 'rocminfo': os.path.join(self.rocm_path, 'bin', 'rocminfo'), + "rocm_root": self.rocm_path, + "hip_path": os.path.join(self.rocm_path, "bin", "hipconfig"), + "rocminfo": os.path.join(self.rocm_path, "bin", "rocminfo"), } self.RECOMMENDED_PATHS = { - 
'amd_smi': os.path.join(self.rocm_path, 'bin', 'amd-smi'), - 'rocm_smi': os.path.join(self.rocm_path, 'bin', 'rocm-smi'), + "amd_smi": os.path.join(self.rocm_path, "bin", "amd-smi"), + "rocm_smi": os.path.join(self.rocm_path, "bin", "rocm-smi"), } self._tool_manager = None # Lazy initialization - + def _run_command(self, cmd: List[str], timeout: int = 10) -> Tuple[bool, str, str]: """Run a command and return success status and output - + Args: cmd: Command to run as list of strings timeout: Timeout in seconds - + Returns: Tuple of (success, stdout, stderr) """ try: result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=timeout + cmd, capture_output=True, text=True, timeout=timeout ) return result.returncode == 0, result.stdout.strip(), result.stderr.strip() except subprocess.TimeoutExpired: @@ -96,33 +95,34 @@ def _run_command(self, cmd: List[str], timeout: int = 10) -> Tuple[bool, str, st return False, "", f"Command not found: {cmd[0]}" except Exception as e: return False, "", str(e) - + def _check_path_exists(self, path: str) -> bool: """Check if a path exists""" return os.path.exists(path) - + def _get_tool_manager(self): """Get or create ROCm tool manager instance - + Returns: ROCmToolManager instance """ if self._tool_manager is None: try: from madengine.utils.rocm_tool_manager import ROCmToolManager + self._tool_manager = ROCmToolManager(rocm_path=self.rocm_path) except ImportError as e: if self.verbose: print(f"Warning: Could not import ROCmToolManager: {e}") return None return self._tool_manager - + def _get_rocm_version(self) -> Optional[str]: """Get ROCm version from system using tool manager - + Returns: ROCm version string or None if not found - + Enhancement: Uses ROCmToolManager for robust multi-method version detection. 
""" @@ -133,30 +133,30 @@ def _get_rocm_version(self) -> Optional[str]: return tool_manager.get_version() except Exception: pass # Fallback to direct methods - + # Fallback: Try hipconfig first - success, stdout, _ = self._run_command(['hipconfig', '--version']) + success, stdout, _ = self._run_command(["hipconfig", "--version"]) if success and stdout: - return stdout.split('-')[0] # Remove build suffix - + return stdout.split("-")[0] # Remove build suffix + # Try version file - version_file = os.path.join(self.rocm_path, '.info', 'version') + version_file = os.path.join(self.rocm_path, ".info", "version") if os.path.exists(version_file): try: - with open(version_file, 'r') as f: - version = f.read().strip().split('-')[0] + with open(version_file, "r") as f: + version = f.read().strip().split("-")[0] return version except Exception: pass - + return None - + def _check_gpu_accessible(self) -> Tuple[bool, str]: """Check if GPUs are accessible using version-aware tool selection - + Returns: Tuple of (accessible, message) - + Enhancement: Uses tool manager to prefer correct tool based on ROCm version (PR #54). 
""" @@ -168,73 +168,80 @@ def _check_gpu_accessible(self) -> Tuple[bool, str]: if count > 0: version = tool_manager.get_rocm_version() preferred_tool = tool_manager.get_preferred_smi_tool() - return True, f"GPUs accessible via tool manager ({preferred_tool}, ROCm {version})" + return ( + True, + f"GPUs accessible via tool manager ({preferred_tool}, ROCm {version})", + ) except Exception: pass # Fall back to direct checks - + # Fallback: Try rocminfo first (most reliable for detection) - success, stdout, stderr = self._run_command(['rocminfo']) + success, stdout, stderr = self._run_command(["rocminfo"]) if success: # Check if any GPU agents are listed - if 'Agent' in stdout and 'gfx' in stdout.lower(): + if "Agent" in stdout and "gfx" in stdout.lower(): return True, "GPUs accessible via rocminfo" else: return False, "rocminfo ran but no GPU agents detected" - + # Try amd-smi - success, stdout, stderr = self._run_command(['amd-smi', 'list']) + success, stdout, stderr = self._run_command(["amd-smi", "list"]) if success and stdout: return True, "GPUs accessible via amd-smi" - + # Try rocm-smi - success, stdout, stderr = self._run_command(['rocm-smi']) + success, stdout, stderr = self._run_command(["rocm-smi"]) if success and stdout: return True, "GPUs accessible via rocm-smi" - + return False, "No GPU management tool could detect GPUs" - + def _check_kfd_driver(self) -> Tuple[bool, List[str], List[str]]: """Check if KFD driver is loaded - + Returns: Tuple of (loaded, critical_issues, warnings) """ critical_issues = [] warnings = [] - + # Check /dev/kfd - this is critical - if not self._check_path_exists('/dev/kfd'): - critical_issues.append("/dev/kfd device not found - KFD driver may not be loaded") - + if not self._check_path_exists("/dev/kfd"): + critical_issues.append( + "/dev/kfd device not found - KFD driver may not be loaded" + ) + # Check KFD topology - this is critical - if not self._check_path_exists('/sys/devices/virtual/kfd/kfd/topology/nodes'): - 
critical_issues.append("KFD topology not found - GPU topology may not be available") - + if not self._check_path_exists("/sys/devices/virtual/kfd/kfd/topology/nodes"): + critical_issues.append( + "KFD topology not found - GPU topology may not be available" + ) + # Check dmesg for amdgpu module - this is just a warning if other checks pass - success, stdout, _ = self._run_command(['dmesg'], timeout=5) + success, stdout, _ = self._run_command(["dmesg"], timeout=5) if success: - if 'amdgpu' not in stdout.lower(): + if "amdgpu" not in stdout.lower(): warnings.append("amdgpu driver messages not found in dmesg") - + return len(critical_issues) == 0, critical_issues, warnings - + def validate(self) -> GPUValidationResult: """Perform comprehensive ROCm validation - + Returns: GPUValidationResult with validation results """ result = GPUValidationResult(is_valid=True, vendor=GPUVendor.AMD) - + if self.verbose: print("=" * 70) print("ROCm Installation Validation") print("=" * 70) - + # 1. Check essential paths if self.verbose: print("\n[1/6] Checking essential ROCm paths...") - + for name, path in self.ESSENTIAL_PATHS.items(): if not self._check_path_exists(path): result.is_valid = False @@ -244,11 +251,11 @@ def validate(self) -> GPUValidationResult: else: if self.verbose: print(f" ✓ {name}: Found at {path}") - + # 2. Get ROCm version if self.verbose: print("\n[2/6] Detecting ROCm version...") - + version = self._get_rocm_version() if version: result.version = version @@ -259,11 +266,11 @@ def validate(self) -> GPUValidationResult: result.issues.append("Unable to detect ROCm version") if self.verbose: print(f" ✗ ROCm version: NOT DETECTED") - + # 3. 
Check recommended tools if self.verbose: print("\n[3/6] Checking recommended ROCm tools...") - + has_smi = False for name, path in self.RECOMMENDED_PATHS.items(): if self._check_path_exists(path): @@ -273,21 +280,21 @@ def validate(self) -> GPUValidationResult: else: if self.verbose: print(f" ⚠ {name}: NOT FOUND at {path}") - + if not has_smi: result.warnings.append("No GPU management tool (amd-smi/rocm-smi) found") result.suggestions.append("Install ROCm SMI tools for GPU monitoring") - + # 4. Check KFD driver if self.verbose: print("\n[4/6] Checking KFD driver...") - + kfd_ok, kfd_critical_issues, kfd_warnings = self._check_kfd_driver() - + # 5. Check GPU accessibility if self.verbose: print("\n[5/6] Checking GPU accessibility...") - + gpu_accessible, gpu_msg = self._check_gpu_accessible() if gpu_accessible: if self.verbose: @@ -297,7 +304,7 @@ def validate(self) -> GPUValidationResult: result.issues.append(gpu_msg) if self.verbose: print(f" ✗ {gpu_msg}") - + # Now decide how to handle KFD issues based on GPU accessibility # If GPUs are accessible, treat KFD dmesg warnings as non-critical if not kfd_ok: @@ -327,25 +334,31 @@ def validate(self) -> GPUValidationResult: print(f" ✓ KFD driver loaded") for warning in kfd_warnings: print(f" ⚠ {warning}") - + # 6. 
Check permissions if self.verbose: print("\n[6/6] Checking permissions...") - - if os.path.exists('/dev/kfd'): + + if os.path.exists("/dev/kfd"): try: # Try to access /dev/kfd - if os.access('/dev/kfd', os.R_OK | os.W_OK): + if os.access("/dev/kfd", os.R_OK | os.W_OK): if self.verbose: print(f" ✓ /dev/kfd is accessible") else: - result.warnings.append("Current user may not have permission to access /dev/kfd") - result.suggestions.append("Add user to 'video' or 'render' group: sudo usermod -aG video,render $USER") + result.warnings.append( + "Current user may not have permission to access /dev/kfd" + ) + result.suggestions.append( + "Add user to 'video' or 'render' group: sudo usermod -aG video,render $USER" + ) if self.verbose: - print(f" ⚠ /dev/kfd exists but may not be accessible by current user") + print( + f" ⚠ /dev/kfd exists but may not be accessible by current user" + ) except Exception as e: result.warnings.append(f"Unable to check /dev/kfd permissions: {e}") - + # Generate suggestions based on issues if result.issues: if not self._check_path_exists(self.rocm_path): @@ -354,7 +367,7 @@ def validate(self) -> GPUValidationResult: "Set ROCM_PATH if using a non-default install, or install ROCm: " "https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html" ) - + if "KFD driver" in str(result.issues): result.suggestions.append( "Load amdgpu kernel module: sudo modprobe amdgpu" @@ -362,7 +375,7 @@ def validate(self) -> GPUValidationResult: result.suggestions.append( "Reboot the system after ROCm installation to ensure kernel drivers are loaded" ) - + # Print summary if self.verbose: print("\n" + "=" * 70) @@ -372,87 +385,88 @@ def validate(self) -> GPUValidationResult: print("✓ ROCm installation is VALID") else: print("✗ ROCm installation has ISSUES") - + if result.version: print(f"\nROCm Version: {result.version}") - + if result.issues: print(f"\nIssues Found ({len(result.issues)}):") for i, issue in enumerate(result.issues, 1): print(f" {i}. 
{issue}") - + if result.warnings: print(f"\nWarnings ({len(result.warnings)}):") for i, warning in enumerate(result.warnings, 1): print(f" {i}. {warning}") - + if result.suggestions: print(f"\nSuggestions ({len(result.suggestions)}):") for i, suggestion in enumerate(result.suggestions, 1): print(f" {i}. {suggestion}") - + print("=" * 70) - + return result - + def get_error_message(self, result: GPUValidationResult) -> str: """Generate a detailed error message from validation result - + Args: result: ROCmValidationResult from validate() - + Returns: Formatted error message string """ if result.is_valid: return "" - + lines = ["ROCm installation validation FAILED:"] lines.append("") - + if result.issues: lines.append("Critical Issues:") for issue in result.issues: lines.append(f" - {issue}") lines.append("") - + if result.warnings: lines.append("Warnings:") for warning in result.warnings: lines.append(f" - {warning}") lines.append("") - + if result.suggestions: lines.append("Suggested Actions:") for suggestion in result.suggestions: lines.append(f" • {suggestion}") lines.append("") - - lines.append("Please ensure ROCm is properly installed before running madengine.") - lines.append("Installation guide: https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html") - + + lines.append( + "Please ensure ROCm is properly installed before running madengine." 
+ ) + lines.append( + "Installation guide: https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html" + ) + return "\n".join(lines) class NVIDIAValidator: """Validator for NVIDIA CUDA installation""" - + def __init__(self, verbose: bool = False): """Initialize NVIDIA validator - + Args: verbose: If True, print detailed validation progress """ self.verbose = verbose - + def _run_command(self, cmd: List[str], timeout: int = 10) -> Tuple[bool, str, str]: """Run a command and return success status and output""" try: result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=timeout + cmd, capture_output=True, text=True, timeout=timeout ) return result.returncode == 0, result.stdout.strip(), result.stderr.strip() except subprocess.TimeoutExpired: @@ -461,43 +475,46 @@ def _run_command(self, cmd: List[str], timeout: int = 10) -> Tuple[bool, str, st return False, "", f"Command not found: {cmd[0]}" except Exception as e: return False, "", str(e) - + def _get_cuda_version(self) -> Optional[str]: """Get CUDA version from nvidia-smi or nvcc""" # Try nvidia-smi first - success, stdout, _ = self._run_command(['nvidia-smi', '--query-gpu=driver_version', '--format=csv,noheader']) + success, stdout, _ = self._run_command( + ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"] + ) if success and stdout: - return stdout.split('\n')[0].strip() - + return stdout.split("\n")[0].strip() + # Try nvcc as fallback - success, stdout, _ = self._run_command(['nvcc', '--version']) - if success and 'release' in stdout.lower(): + success, stdout, _ = self._run_command(["nvcc", "--version"]) + if success and "release" in stdout.lower(): # Extract version from output like "release 11.8, V11.8.89" import re - match = re.search(r'release (\d+\.\d+)', stdout) + + match = re.search(r"release (\d+\.\d+)", stdout) if match: return match.group(1) - + return None - + def validate(self) -> GPUValidationResult: """Perform NVIDIA CUDA validation - + Returns: 
GPUValidationResult with validation results """ result = GPUValidationResult(is_valid=True, vendor=GPUVendor.NVIDIA) - + if self.verbose: print("=" * 70) print("NVIDIA GPU (CUDA) Validation") print("=" * 70) print() - + # 1. Check nvidia-smi if self.verbose: print("[1/4] Checking nvidia-smi availability...") - + if not os.path.exists("/usr/bin/nvidia-smi"): result.is_valid = False result.issues.append("nvidia-smi not found at /usr/bin/nvidia-smi") @@ -506,12 +523,12 @@ def validate(self) -> GPUValidationResult: else: if self.verbose: print(" ✓ nvidia-smi: Found") - + # 2. Test nvidia-smi execution if self.verbose: print("\n[2/4] Testing nvidia-smi execution...") - - success, stdout, stderr = self._run_command(['nvidia-smi', '--list-gpus']) + + success, stdout, stderr = self._run_command(["nvidia-smi", "--list-gpus"]) if not success: result.is_valid = False result.issues.append(f"nvidia-smi failed to execute: {stderr}") @@ -520,11 +537,11 @@ def validate(self) -> GPUValidationResult: else: if self.verbose: print(" ✓ nvidia-smi executed successfully") - + # 3. Get CUDA version if self.verbose: print("\n[3/4] Detecting CUDA version...") - + version = self._get_cuda_version() if version: result.version = version @@ -534,25 +551,25 @@ def validate(self) -> GPUValidationResult: result.warnings.append("Unable to detect CUDA version") if self.verbose: print(" ⚠ Could not detect CUDA version") - + # 4. 
Count GPUs if self.verbose: print("\n[4/4] Counting available GPUs...") - - success, stdout, _ = self._run_command(['nvidia-smi', '--list-gpus']) + + success, stdout, _ = self._run_command(["nvidia-smi", "--list-gpus"]) if success and stdout: - gpu_count = len([line for line in stdout.split('\n') if line.strip()]) + gpu_count = len([line for line in stdout.split("\n") if line.strip()]) if gpu_count > 0: if self.verbose: print(f" ✓ Found {gpu_count} GPU(s)") - for line in stdout.split('\n'): + for line in stdout.split("\n"): if line.strip(): print(f" - {line.strip()}") else: result.warnings.append("No GPUs detected") if self.verbose: print(" ⚠ No GPUs detected") - + # Generate suggestions if result.issues: if "nvidia-smi not found" in str(result.issues): @@ -561,9 +578,11 @@ def validate(self) -> GPUValidationResult: "https://developer.nvidia.com/cuda-downloads" ) if "failed to execute" in str(result.issues): - result.suggestions.append("Check if NVIDIA drivers are properly loaded: lsmod | grep nvidia") + result.suggestions.append( + "Check if NVIDIA drivers are properly loaded: lsmod | grep nvidia" + ) result.suggestions.append("Try reinstalling NVIDIA drivers") - + if self.verbose: print("\n" + "=" * 70) print("NVIDIA Validation Summary") @@ -572,27 +591,27 @@ def validate(self) -> GPUValidationResult: print("✓ NVIDIA GPU installation is VALID") else: print("✗ NVIDIA GPU installation has ISSUES") - + if result.version: print(f"\nDriver/CUDA Version: {result.version}") - + if result.issues: print(f"\nIssues Found ({len(result.issues)}):") for i, issue in enumerate(result.issues, 1): print(f" {i}. {issue}") - + if result.warnings: print(f"\nWarnings ({len(result.warnings)}):") for i, warning in enumerate(result.warnings, 1): print(f" {i}. {warning}") - + if result.suggestions: print(f"\nSuggestions ({len(result.suggestions)}):") for i, suggestion in enumerate(result.suggestions, 1): print(f" {i}. 
{suggestion}") - + print("=" * 70) - + return result @@ -608,7 +627,9 @@ def detect_gpu_vendor(rocm_path: Optional[str] = None) -> GPUVendor: if os.path.exists("/usr/bin/nvidia-smi"): return GPUVendor.NVIDIA rocm = get_rocm_path(rocm_path) - if os.path.exists(os.path.join(rocm, "bin", "rocm-smi")) or os.path.exists(os.path.join(rocm, "bin", "amd-smi")): + if os.path.exists(os.path.join(rocm, "bin", "rocm-smi")) or os.path.exists( + os.path.join(rocm, "bin", "amd-smi") + ): return GPUVendor.AMD if os.path.exists("/usr/local/bin/amd-smi"): return GPUVendor.AMD @@ -648,7 +669,7 @@ def validate_gpu_installation( version=rocm_result.version, issues=rocm_result.issues, warnings=rocm_result.warnings, - suggestions=rocm_result.suggestions + suggestions=rocm_result.suggestions, ) elif vendor == GPUVendor.NVIDIA: validator = NVIDIAValidator(verbose=verbose) @@ -656,63 +677,67 @@ def validate_gpu_installation( else: result = GPUValidationResult(is_valid=False, vendor=GPUVendor.UNKNOWN) result.issues.append("No GPU vendor detected") - result.suggestions.append("Install NVIDIA drivers (https://developer.nvidia.com/cuda-downloads)") + result.suggestions.append( + "Install NVIDIA drivers (https://developer.nvidia.com/cuda-downloads)" + ) result.suggestions.append("Or install AMD ROCm (https://rocm.docs.amd.com)") - + if not result.is_valid and raise_on_error: raise GPUInstallationError(result) - + return result class GPUInstallationError(RuntimeError): """Exception raised when GPU installation validation fails""" - + def __init__(self, validation_result: GPUValidationResult): """Initialize with validation result - + Args: validation_result: GPUValidationResult from validation """ self.validation_result = validation_result message = self._format_error_message(validation_result) super().__init__(message) - + def _format_error_message(self, result: GPUValidationResult) -> str: """Generate a detailed error message from validation result""" if result.is_valid: return "" - + lines = 
[f"{result.vendor.value} GPU installation validation FAILED:"] lines.append("") - + if result.issues: lines.append("Critical Issues:") for issue in result.issues: lines.append(f" - {issue}") lines.append("") - + if result.warnings: lines.append("Warnings:") for warning in result.warnings: lines.append(f" - {warning}") lines.append("") - + if result.suggestions: lines.append("Suggested Actions:") for suggestion in result.suggestions: lines.append(f" • {suggestion}") lines.append("") - + vendor_docs = { GPUVendor.AMD: "https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html", GPUVendor.NVIDIA: "https://developer.nvidia.com/cuda-downloads", } - - lines.append(f"Please ensure {result.vendor.value} GPU drivers and tools are properly installed.") + + lines.append( + f"Please ensure {result.vendor.value} GPU drivers and tools are properly installed." + ) if result.vendor in vendor_docs: lines.append(f"Installation guide: {vendor_docs[result.vendor]}") - + return "\n".join(lines) @@ -740,20 +765,25 @@ def validate_rocm_installation( GPUInstallationError: If validation fails and raise_on_error is True """ return validate_gpu_installation( - vendor=GPUVendor.AMD, verbose=verbose, raise_on_error=raise_on_error, rocm_path=rocm_path + vendor=GPUVendor.AMD, + verbose=verbose, + raise_on_error=raise_on_error, + rocm_path=rocm_path, ) if __name__ == "__main__": # Command-line usage import sys - - verbose = '--verbose' in sys.argv or '-v' in sys.argv - result = validate_gpu_installation(vendor=None, verbose=verbose, raise_on_error=False) - + + verbose = "--verbose" in sys.argv or "-v" in sys.argv + result = validate_gpu_installation( + vendor=None, verbose=verbose, raise_on_error=False + ) + if result.is_valid: print(f"\n✓ {result.vendor.value} GPU installation validated successfully") if result.version: print(f"Version: {result.version}") - + sys.exit(0 if result.is_valid else 1) diff --git a/src/madengine/utils/log_formatting.py b/src/madengine/utils/log_formatting.py 
index 14a0eed5..a56ed46a 100644 --- a/src/madengine/utils/log_formatting.py +++ b/src/madengine/utils/log_formatting.py @@ -9,8 +9,8 @@ """ import pandas as pd -from rich.table import Table from rich.console import Console as RichConsole +from rich.table import Table def format_dataframe_for_log( diff --git a/src/madengine/utils/nvidia_tool_manager.py b/src/madengine/utils/nvidia_tool_manager.py index 73259b38..1cc9d9dc 100644 --- a/src/madengine/utils/nvidia_tool_manager.py +++ b/src/madengine/utils/nvidia_tool_manager.py @@ -25,38 +25,38 @@ class NvidiaToolManager(BaseGPUToolManager): """NVIDIA CUDA tool manager with basic functionality. - + Provides simple wrappers around NVIDIA tools while maintaining compatibility with BaseGPUToolManager interface. - + Current implementation: - nvidia-smi for GPU queries - nvcc for CUDA version - Basic error handling - + No version-aware tool selection yet (deferred for future work). """ - + # Tool paths NVIDIA_SMI_PATH = "/usr/bin/nvidia-smi" NVCC_PATH = "/usr/local/cuda/bin/nvcc" - + def __init__(self): """Initialize NVIDIA tool manager.""" super().__init__() self._log_debug("Initialized NVIDIA tool manager") - + def get_version(self) -> Optional[str]: """Get CUDA version as string. - + Returns: CUDA version string or None if unable to detect """ return self.get_cuda_version() - + def get_cuda_version(self) -> Optional[str]: """Get CUDA version from nvcc. 
- + Returns: CUDA version string (e.g., "12.0") or None if unable to detect """ @@ -64,40 +64,40 @@ def get_cuda_version(self) -> Optional[str]: cached = self._get_cached_result("cuda_version") if cached is not None: return cached - + try: # Try nvcc --version if self.is_tool_available(self.NVCC_PATH): command = f"{self.NVCC_PATH} --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'" success, stdout, stderr = self._execute_shell_command(command) - + if success and stdout: version = stdout.strip() self._cache_result("cuda_version", version) self._log_info(f"CUDA version: {version}") return version - + # Fallback: Try nvidia-smi to get driver version if self.is_tool_available(self.NVIDIA_SMI_PATH): command = f"{self.NVIDIA_SMI_PATH} --query | grep 'CUDA Version' | awk '{{print $4}}'" success, stdout, stderr = self._execute_shell_command(command) - + if success and stdout: version = stdout.strip() self._cache_result("cuda_version", version) self._log_info(f"CUDA version (from nvidia-smi): {version}") return version - + self._log_warning("Unable to detect CUDA version") return None - + except Exception as e: self._log_error(f"Error detecting CUDA version: {e}") return None - + def get_driver_version(self) -> Optional[str]: """Get NVIDIA driver version. 
- + Returns: Driver version string or None if unable to detect """ @@ -105,54 +105,55 @@ def get_driver_version(self) -> Optional[str]: cached = self._get_cached_result("driver_version") if cached is not None: return cached - + try: if self.is_tool_available(self.NVIDIA_SMI_PATH): command = f"{self.NVIDIA_SMI_PATH} --query-gpu=driver_version --format=csv,noheader | head -n1" success, stdout, stderr = self._execute_shell_command(command) - + if success and stdout: version = stdout.strip() self._cache_result("driver_version", version) self._log_info(f"NVIDIA driver version: {version}") return version - + self._log_warning("Unable to detect NVIDIA driver version") return None - + except Exception as e: self._log_error(f"Error detecting driver version: {e}") return None - + def execute_command( - self, - command: str, - fallback_command: Optional[str] = None, - timeout: int = 30 + self, command: str, fallback_command: Optional[str] = None, timeout: int = 30 ) -> str: """Execute command with optional fallback. - + Args: command: Primary command to execute fallback_command: Optional fallback command (currently not used for NVIDIA) timeout: Command timeout in seconds - + Returns: Command output as string - + Raises: RuntimeError: If command fails """ success, stdout, stderr = self._execute_shell_command(command, timeout) - + if success: return stdout - + # Try fallback if provided if fallback_command: - self._log_warning(f"Primary command failed, trying fallback: {fallback_command[:50]}...") - success, stdout, stderr = self._execute_shell_command(fallback_command, timeout) - + self._log_warning( + f"Primary command failed, trying fallback: {fallback_command[:50]}..." 
+ ) + success, stdout, stderr = self._execute_shell_command( + fallback_command, timeout + ) + if success: return stdout else: @@ -164,17 +165,17 @@ def execute_command( ) else: raise RuntimeError(f"Command failed: {command}\nError: {stderr}") - + def execute_nvidia_smi(self, args: str, timeout: int = 30) -> str: """Execute nvidia-smi with specified arguments. - + Args: args: Arguments to pass to nvidia-smi timeout: Command timeout in seconds - + Returns: Command output as string - + Raises: RuntimeError: If nvidia-smi is not available or command fails """ @@ -183,20 +184,20 @@ def execute_nvidia_smi(self, args: str, timeout: int = 30) -> str: f"nvidia-smi not found at {self.NVIDIA_SMI_PATH}\n" f"Ensure NVIDIA drivers are installed." ) - + command = f"{self.NVIDIA_SMI_PATH} {args}" return self.execute_command(command, timeout=timeout) - + def execute_nvcc(self, args: str, timeout: int = 30) -> str: """Execute nvcc with specified arguments. - + Args: args: Arguments to pass to nvcc timeout: Command timeout in seconds - + Returns: Command output as string - + Raises: RuntimeError: If nvcc is not available or command fails """ @@ -205,16 +206,16 @@ def execute_nvcc(self, args: str, timeout: int = 30) -> str: f"nvcc not found at {self.NVCC_PATH}\n" f"Ensure CUDA toolkit is installed." ) - + command = f"{self.NVCC_PATH} {args}" return self.execute_command(command, timeout=timeout) - + def get_gpu_count(self) -> int: """Get number of NVIDIA GPUs in the system. 
- + Returns: Number of GPUs detected - + Raises: RuntimeError: If unable to detect GPUs """ @@ -222,16 +223,16 @@ def get_gpu_count(self) -> int: cached = self._get_cached_result("gpu_count") if cached is not None: return cached - + try: output = self.execute_nvidia_smi("-L | wc -l") count = int(output.strip()) - + self._cache_result("gpu_count", count) self._log_info(f"Detected {count} NVIDIA GPU(s)") - + return count - + except Exception as e: raise RuntimeError( f"Unable to determine number of NVIDIA GPUs.\n" @@ -240,16 +241,16 @@ def get_gpu_count(self) -> int: f"- Verify NVIDIA drivers: nvidia-smi\n" f"- Check GPU accessibility: ls -la /dev/nvidia*" ) - + def get_gpu_product_name(self, gpu_id: int = 0) -> str: """Get GPU product name. - + Args: gpu_id: GPU index (0-based) - + Returns: GPU product name (e.g., "NVIDIA H100 80GB HBM3") - + Raises: RuntimeError: If unable to get product name """ @@ -257,34 +258,34 @@ def get_gpu_product_name(self, gpu_id: int = 0) -> str: cached = self._get_cached_result(cache_key) if cached: return cached - + try: output = self.execute_nvidia_smi( f"--query-gpu=name --format=csv,noheader,nounits -i {gpu_id}" ) product_name = output.strip() - + self._cache_result(cache_key, product_name) self._log_debug(f"GPU {gpu_id} product name: {product_name}") - + return product_name - + except Exception as e: raise RuntimeError( f"Unable to get GPU product name for GPU {gpu_id}.\n" f"Error: {e}\n" f"Ensure GPU {gpu_id} exists: nvidia-smi -L" ) - + def get_gpu_architecture(self, gpu_id: int = 0) -> str: """Get GPU architecture/compute capability. 
- + Args: gpu_id: GPU index (0-based) - + Returns: GPU architecture string - + Raises: RuntimeError: If unable to detect GPU architecture """ @@ -292,22 +293,21 @@ def get_gpu_architecture(self, gpu_id: int = 0) -> str: cached = self._get_cached_result(cache_key) if cached: return cached - + try: # Get full GPU name which includes architecture info output = self.execute_nvidia_smi( f"-L | head -n{gpu_id + 1} | tail -n1 | sed 's/(UUID: .*)//g' | sed 's/GPU {gpu_id}: //g'" ) arch = output.strip() - + self._cache_result(cache_key, arch) self._log_debug(f"GPU {gpu_id} architecture: {arch}") - + return arch - + except Exception as e: raise RuntimeError( f"Unable to determine GPU architecture for GPU {gpu_id}.\n" f"Error: {e}" ) - diff --git a/src/madengine/utils/ops.py b/src/madengine/utils/ops.py index cd717fec..7c4a2890 100644 --- a/src/madengine/utils/ops.py +++ b/src/madengine/utils/ops.py @@ -9,9 +9,10 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ +import sys + # built-in modules import typing -import sys # Class to both write and display stream, in "live" mode diff --git a/src/madengine/utils/path_utils.py b/src/madengine/utils/path_utils.py index 637efb8f..584408e5 100644 --- a/src/madengine/utils/path_utils.py +++ b/src/madengine/utils/path_utils.py @@ -30,4 +30,5 @@ def get_madengine_root() -> Path: Path to the madengine package root. 
""" import madengine + return Path(madengine.__file__).resolve().parent diff --git a/src/madengine/utils/rocm_path_resolver.py b/src/madengine/utils/rocm_path_resolver.py index 639e3aad..3e49e444 100644 --- a/src/madengine/utils/rocm_path_resolver.py +++ b/src/madengine/utils/rocm_path_resolver.py @@ -75,9 +75,7 @@ def __init__( environ: Optional[Mapping[str, str]] = None, which: Optional[WhichFn] = None, ) -> None: - self._environ: Mapping[str, str] = ( - os.environ if environ is None else environ - ) + self._environ: Mapping[str, str] = os.environ if environ is None else environ self._which: WhichFn = which if which is not None else shutil.which @staticmethod @@ -98,11 +96,17 @@ def looks_like_rocm_root(self, root: Path) -> bool: if (root / "bin" / "rocminfo").is_file(): return True # Versioned apt/tar layouts (e.g. /opt/rocm-7.13.0) and many TheRock images - if (root / "bin" / "amd-smi").is_file() and (root / "bin" / "rocm-smi").is_file(): + if (root / "bin" / "amd-smi").is_file() and ( + root / "bin" / "rocm-smi" + ).is_file(): return True - if (root / "bin" / "rocm-smi").is_file() and (root / ".info" / "version").is_file(): + if (root / "bin" / "rocm-smi").is_file() and ( + root / ".info" / "version" + ).is_file(): return True - if (root / "bin" / "amd-smi").is_file() and (root / ".info" / "version").is_file(): + if (root / "bin" / "amd-smi").is_file() and ( + root / ".info" / "version" + ).is_file(): return True if (root / ".info" / "version").is_file(): return True @@ -120,7 +124,9 @@ def versioned_opt_rocm_dirs(self) -> List[Path]: def infer_from_path_tools(self) -> OptionalPathStr: """Use ``which`` on rocminfo, amd-smi, rocm-smi; return first plausible root.""" - from madengine.utils import rocm_path_resolver as m # same module; for patch hooks + from madengine.utils import ( + rocm_path_resolver as m, + ) # same module; for patch hooks for name in ("rocminfo", "amd-smi", "rocm-smi"): w = self._which(name) # type: ignore[operator] @@ -133,7 +139,9 @@ def 
infer_from_path_tools(self) -> OptionalPathStr: def auto_detect(self) -> OptionalPathStr: """Heuristic search for a usable ROCm installation (see class doc + module doc).""" - from madengine.utils import rocm_path_resolver as m # same module; for patch hooks + from madengine.utils import ( + rocm_path_resolver as m, + ) # same module; for patch hooks opt = Path("/opt/rocm") if m._looks_like_rocm_root(opt): @@ -404,9 +412,7 @@ def finalize_container_rocm_path( if oci: croot = normalize_rocm_path(oci) d["ROCM_PATH"] = croot - log( - f"ROCm container ROCM_PATH from image OCI config ({docker_image}): {croot}" - ) + log(f"ROCm container ROCM_PATH from image OCI config ({docker_image}): {croot}") return croot log( @@ -428,5 +434,3 @@ def finalize_container_rocm_path( f"{croot} (set docker_env_vars.ROCM_PATH if wrong)." ) return croot - - diff --git a/src/madengine/utils/rocm_tool_manager.py b/src/madengine/utils/rocm_tool_manager.py index 60870d29..af6d9e63 100644 --- a/src/madengine/utils/rocm_tool_manager.py +++ b/src/madengine/utils/rocm_tool_manager.py @@ -25,20 +25,19 @@ from madengine.core.constants import get_rocm_path from madengine.utils.gpu_tool_manager import BaseGPUToolManager - # ROCm version threshold for amd-smi vs rocm-smi (from PR #54) ROCM_VERSION_THRESHOLD = (6, 4, 1) class ROCmToolManager(BaseGPUToolManager): """AMD ROCm tool manager with version-aware tool selection. 
- + Features: - Automatic ROCm version detection from multiple sources - Version-aware tool selection (amd-smi >= 6.4.1, rocm-smi < 6.4.1) - Automatic fallback with warnings when preferred tool unavailable - Comprehensive error messages with troubleshooting suggestions - + Tool Selection Logic: - ROCm >= 6.4.1: Prefer amd-smi, fallback to rocm-smi with warning - ROCm < 6.4.1: Use rocm-smi @@ -59,10 +58,10 @@ def __init__(self, rocm_path: Optional[str] = None): self.ROCMINFO_PATH = os.path.join(self.rocm_path, "bin", "rocminfo") self.ROCM_VERSION_FILE = os.path.join(self.rocm_path, ".info", "version") self._log_debug("Initialized ROCm tool manager") - + def get_version(self) -> Optional[str]: """Get ROCm version as string. - + Returns: ROCm version string (e.g., "6.4.1") or None if unable to detect """ @@ -70,20 +69,20 @@ def get_version(self) -> Optional[str]: if version_tuple: return ".".join(map(str, version_tuple)) return None - + def get_rocm_version(self) -> Optional[Tuple[int, int, int]]: """Get ROCm version as tuple. - + Tries multiple detection methods in order: 1. hipconfig --version 2. /opt/rocm/.info/version file 3. rocminfo parsing - + Results are cached for performance. 
- + Returns: ROCm version as tuple (major, minor, patch) or None if unable to detect - + Example: >>> manager = ROCmToolManager() >>> manager.get_rocm_version() @@ -93,123 +92,134 @@ def get_rocm_version(self) -> Optional[Tuple[int, int, int]]: cached = self._get_cached_result("rocm_version") if cached is not None: return cached - + version = None - + # Method 1: Try hipconfig --version if self.is_tool_available(self.HIPCONFIG_PATH): success, stdout, stderr = self._execute_shell_command( - f"{self.HIPCONFIG_PATH} --version", - timeout=10 + f"{self.HIPCONFIG_PATH} --version", timeout=10 ) if success and stdout: # Parse version like "6.4.1-12345" -> (6, 4, 1) try: - version_str = stdout.split('-')[0].strip() - parts = version_str.split('.') + version_str = stdout.split("-")[0].strip() + parts = version_str.split(".") if len(parts) >= 3: version = (int(parts[0]), int(parts[1]), int(parts[2])) - self._log_debug(f"Detected ROCm version from hipconfig: {version}") + self._log_debug( + f"Detected ROCm version from hipconfig: {version}" + ) except (ValueError, IndexError) as e: - self._log_warning(f"Failed to parse hipconfig version '{stdout}': {e}") - + self._log_warning( + f"Failed to parse hipconfig version '{stdout}': {e}" + ) + # Method 2: Try version file if version is None and os.path.exists(self.ROCM_VERSION_FILE): try: - with open(self.ROCM_VERSION_FILE, 'r') as f: - version_str = f.read().strip().split('-')[0] - parts = version_str.split('.') + with open(self.ROCM_VERSION_FILE, "r") as f: + version_str = f.read().strip().split("-")[0] + parts = version_str.split(".") if len(parts) >= 3: version = (int(parts[0]), int(parts[1]), int(parts[2])) self._log_debug(f"Detected ROCm version from file: {version}") except (IOError, ValueError, IndexError) as e: self._log_warning(f"Failed to read version file: {e}") - + # Method 3: Try rocminfo (less reliable, last resort) if version is None and self.is_tool_available(self.ROCMINFO_PATH): success, stdout, stderr = 
self._execute_shell_command( - f"{self.ROCMINFO_PATH} | grep -i 'ROCm Version' | head -n1", - timeout=10 + f"{self.ROCMINFO_PATH} | grep -i 'ROCm Version' | head -n1", timeout=10 ) if success and stdout: try: # Parse output like "ROCm Version: 6.4.1" - match = re.search(r'(\d+)\.(\d+)\.(\d+)', stdout) + match = re.search(r"(\d+)\.(\d+)\.(\d+)", stdout) if match: - version = (int(match.group(1)), int(match.group(2)), int(match.group(3))) - self._log_debug(f"Detected ROCm version from rocminfo: {version}") + version = ( + int(match.group(1)), + int(match.group(2)), + int(match.group(3)), + ) + self._log_debug( + f"Detected ROCm version from rocminfo: {version}" + ) except (ValueError, AttributeError) as e: self._log_warning(f"Failed to parse rocminfo output: {e}") - + # Cache the result (even if None) self._cache_result("rocm_version", version) - + if version: self._log_info(f"ROCm version: {'.'.join(map(str, version))}") else: self._log_warning("Unable to detect ROCm version from any source") - + return version - + def get_preferred_smi_tool(self) -> str: """Get the preferred SMI tool based on ROCm version. - + Returns: Tool name: 'amd-smi' or 'rocm-smi' - + Logic: - ROCm >= 6.4.1: Prefer amd-smi - ROCm < 6.4.1: Use rocm-smi - Unknown version: Try amd-smi first (conservative choice) """ version = self.get_rocm_version() - + if version is None: self._log_warning("ROCm version unknown, defaulting to amd-smi") return "amd-smi" - + if version >= ROCM_VERSION_THRESHOLD: return "amd-smi" else: return "rocm-smi" - + def execute_command( - self, - command: str, - fallback_command: Optional[str] = None, - timeout: int = 30 + self, command: str, fallback_command: Optional[str] = None, timeout: int = 30 ) -> str: """Execute command with optional fallback. 
- + Args: command: Primary command to execute fallback_command: Optional fallback command if primary fails timeout: Command timeout in seconds - + Returns: Command output as string - + Raises: RuntimeError: If both primary and fallback commands fail """ # Try primary command success, stdout, stderr = self._execute_shell_command(command, timeout) - + if success: self._log_debug(f"Command succeeded: {command[:50]}...") return stdout - + # Capture primary error before attempting fallback (fallback overwrites stderr) primary_stderr = stderr - self._log_warning(f"Primary command failed: {command[:50]}... Error: {primary_stderr}") + self._log_warning( + f"Primary command failed: {command[:50]}... Error: {primary_stderr}" + ) # Try fallback if provided if fallback_command: self._log_info(f"Trying fallback command: {fallback_command[:50]}...") - success, stdout, stderr = self._execute_shell_command(fallback_command, timeout) + success, stdout, stderr = self._execute_shell_command( + fallback_command, timeout + ) if success: - self._log_warning("Fallback command succeeded (primary tool may be missing or misconfigured)") + self._log_warning( + "Fallback command succeeded (primary tool may be missing or misconfigured)" + ) return stdout else: # Both failed @@ -223,25 +233,27 @@ def execute_command( else: # No fallback, raise error raise RuntimeError(f"Command failed: {command}\nError: {stderr}") - - def execute_smi_command(self, command_template: str, use_amd_smi: bool = True, **kwargs) -> str: + + def execute_smi_command( + self, command_template: str, use_amd_smi: bool = True, **kwargs + ) -> str: """Execute SMI command with automatic tool selection and fallback. 
- + Args: command_template: Command template with {tool} placeholder use_amd_smi: If True, use amd-smi syntax; if False, use rocm-smi syntax **kwargs: Additional format parameters for command template - + Returns: Command output as string - + Example: >>> manager = ROCmToolManager() >>> # Will try amd-smi, fallback to rocm-smi if needed >>> output = manager.execute_smi_command("{tool} list --csv") """ preferred_tool = self.get_preferred_smi_tool() - + # Format command with preferred tool if preferred_tool == "amd-smi": tool_path = self.AMD_SMI_PATH @@ -249,22 +261,22 @@ def execute_smi_command(self, command_template: str, use_amd_smi: bool = True, * else: tool_path = self.ROCM_SMI_PATH fallback_path = self.AMD_SMI_PATH - + command = command_template.format(tool=tool_path, **kwargs) - + # Create fallback command if fallback tool is available fallback_command = None if self.is_tool_available(fallback_path): fallback_command = command_template.format(tool=fallback_path, **kwargs) - + return self.execute_command(command, fallback_command) - + def get_gpu_count(self) -> int: """Get number of AMD GPUs in the system. 
- + Returns: Number of GPUs detected - + Raises: RuntimeError: If unable to detect GPUs with any tool """ @@ -272,9 +284,9 @@ def get_gpu_count(self) -> int: cached = self._get_cached_result("gpu_count") if cached is not None: return cached - + preferred_tool = self.get_preferred_smi_tool() - + try: if preferred_tool == "amd-smi": # Try amd-smi list --csv @@ -283,17 +295,21 @@ def get_gpu_count(self) -> int: else: # Use rocm-smi command = f"{self.ROCM_SMI_PATH} --showid --csv | tail -n +2 | wc -l" - fallback = f"{self.AMD_SMI_PATH} list --csv | tail -n +3 | wc -l" if self.is_tool_available(self.AMD_SMI_PATH) else None - + fallback = ( + f"{self.AMD_SMI_PATH} list --csv | tail -n +3 | wc -l" + if self.is_tool_available(self.AMD_SMI_PATH) + else None + ) + output = self.execute_command(command, fallback) count = int(output.strip()) - + # Cache result self._cache_result("gpu_count", count) self._log_info(f"Detected {count} AMD GPU(s)") - + return count - + except Exception as e: raise RuntimeError( f"Unable to determine number of AMD GPUs.\n" @@ -304,16 +320,16 @@ def get_gpu_count(self) -> int: f"- Ensure user is in 'video' and 'render' groups\n" f"- See: https://github.com/ROCm/TheRock" ) - + def get_gpu_product_name(self, gpu_id: int = 0) -> str: """Get GPU product name with fallback (from PR #54). 
- + Args: gpu_id: GPU index (0-based) - + Returns: GPU product name (e.g., "AMD Instinct MI300X") - + Raises: RuntimeError: If unable to get product name with any tool """ @@ -321,9 +337,9 @@ def get_gpu_product_name(self, gpu_id: int = 0) -> str: cached = self._get_cached_result(cache_key) if cached: return cached - + preferred_tool = self.get_preferred_smi_tool() - + try: if preferred_tool == "amd-smi": # Try amd-smi static command @@ -334,17 +350,21 @@ def get_gpu_product_name(self, gpu_id: int = 0) -> str: # Use rocm-smi command = f"{self.ROCM_SMI_PATH} --showproductname | grep 'GPU\\[{gpu_id}\\]' | awk '{{print $NF}}'" # Fallback to amd-smi if available - fallback = f"{self.AMD_SMI_PATH} static -g {gpu_id} | grep MARKET_NAME: | cut -d ':' -f 2" if self.is_tool_available(self.AMD_SMI_PATH) else None - + fallback = ( + f"{self.AMD_SMI_PATH} static -g {gpu_id} | grep MARKET_NAME: | cut -d ':' -f 2" + if self.is_tool_available(self.AMD_SMI_PATH) + else None + ) + output = self.execute_command(command, fallback) product_name = output.strip() - + # Cache result self._cache_result(cache_key, product_name) self._log_debug(f"GPU {gpu_id} product name: {product_name}") - + return product_name - + except Exception as e: raise RuntimeError( f"Unable to get GPU product name for GPU {gpu_id}.\n" @@ -354,13 +374,13 @@ def get_gpu_product_name(self, gpu_id: int = 0) -> str: f"- Check ROCm version: cat {self.ROCM_VERSION_FILE}\n" f"- For ROCm >= 6.4.1, ensure amd-smi is installed" ) - + def get_gpu_architecture(self) -> str: """Get GPU architecture (e.g., gfx908, gfx90a, gfx942). 
- + Returns: GPU architecture string - + Raises: RuntimeError: If unable to detect GPU architecture """ @@ -368,12 +388,12 @@ def get_gpu_architecture(self) -> str: cached = self._get_cached_result("gpu_architecture") if cached: return cached - + try: # Use rocminfo to get architecture (most reliable) command = f"{self.ROCMINFO_PATH} | grep -o -m 1 'gfx.*'" success, stdout, stderr = self._execute_shell_command(command) - + if success and stdout: arch = stdout.strip() self._cache_result("gpu_architecture", arch) @@ -381,7 +401,7 @@ def get_gpu_architecture(self) -> str: return arch else: raise RuntimeError(f"rocminfo failed or returned empty: {stderr}") - + except Exception as e: raise RuntimeError( f"Unable to determine GPU architecture.\n" @@ -391,13 +411,13 @@ def get_gpu_architecture(self) -> str: f"- Check GPU is visible: {self.ROCM_SMI_PATH} --showid\n" f"- Ensure ROCm is properly installed" ) - + def get_gpu_vendor_check(self) -> str: """Check GPU vendor with fallback (from PR #54). - + Returns: "AMD" if AMD GPU detected, error message otherwise - + Note: This checks if AMD SMI tools can detect GPUs, confirming AMD vendor. """ @@ -410,46 +430,47 @@ def get_gpu_vendor_check(self) -> str: return "No AMD GPUs detected" except Exception as e: return f"Unable to detect AMD GPU vendor: {e}" - + def list_gpus_json(self) -> List[Dict]: """List all GPUs with detailed information in JSON format. 
- + Returns: List of GPU information dictionaries - + Raises: RuntimeError: If unable to list GPUs """ preferred_tool = self.get_preferred_smi_tool() - + try: - if preferred_tool == "amd-smi" and self.is_tool_available(self.AMD_SMI_PATH): + if preferred_tool == "amd-smi" and self.is_tool_available( + self.AMD_SMI_PATH + ): # Try amd-smi list with JSON output command = f"{self.AMD_SMI_PATH} list --json" success, stdout, stderr = self._execute_shell_command(command) - + if success and stdout: try: return json.loads(stdout) except json.JSONDecodeError as e: self._log_warning(f"Failed to parse amd-smi JSON: {e}") - + # Fallback: parse rocm-smi output command = f"{self.ROCM_SMI_PATH} --showid" output = self.execute_command(command) - + # Parse rocm-smi output to JSON-like structure gpus = [] - for line in output.split('\n'): - if 'GPU[' in line: + for line in output.split("\n"): + if "GPU[" in line: try: - gpu_id = int(line.split('[')[1].split(']')[0]) + gpu_id = int(line.split("[")[1].split("]")[0]) gpus.append({"gpu": gpu_id, "node_id": gpu_id}) except (IndexError, ValueError): continue - + return gpus - + except Exception as e: raise RuntimeError(f"Unable to list GPUs: {e}") - diff --git a/src/madengine/utils/session_tracker.py b/src/madengine/utils/session_tracker.py index 4449e496..8eeed31a 100644 --- a/src/madengine/utils/session_tracker.py +++ b/src/madengine/utils/session_tracker.py @@ -14,18 +14,18 @@ class SessionTracker: """ Tracks execution session boundaries for filtering performance results. - + When an execution starts, it records the current row count in perf.csv. After execution, results can be filtered to show only rows added during this session. - + Best Practice: Session marker file is stored in the SAME directory as perf.csv to ensure consistent access regardless of working directory changes. """ - + def __init__(self, perf_csv_path: str = "perf.csv"): """ Initialize session tracker. 
- + Args: perf_csv_path: Path to the performance CSV file """ @@ -33,22 +33,22 @@ def __init__(self, perf_csv_path: str = "perf.csv"): self.session_start_row: Optional[int] = None # Marker file in same directory as perf.csv self.marker_file = self.perf_csv_path.parent / ".madengine_session_start" - + def start_session(self) -> int: """ Mark the start of an execution session. - + Records the current number of rows in perf.csv so we can later identify which rows were added during this session. - + Also saves the marker file for use by child processes. - + Returns: The starting row number (number of rows in CSV before this session) """ if self.perf_csv_path.exists(): # Count existing data rows (excluding header and blank lines) - with open(self.perf_csv_path, 'r') as f: + with open(self.perf_csv_path, "r") as f: lines = f.readlines() non_empty = [l for l in lines if l.strip()] # Subtract 1 for header row @@ -56,44 +56,44 @@ def start_session(self) -> int: else: # No existing file, start at 0 self.session_start_row = 0 - + # Automatically save marker for child processes self._save_marker(self.session_start_row) - + return self.session_start_row - + def get_session_row_count(self) -> int: """ Get the number of rows added during this session. - + Returns: Number of rows added since session start """ if self.session_start_row is None: return 0 - + if not self.perf_csv_path.exists(): return 0 - - with open(self.perf_csv_path, 'r') as f: + + with open(self.perf_csv_path, "r") as f: lines = f.readlines() non_empty = [l for l in lines if l.strip()] current_row_count = max(0, len(non_empty) - 1) # Exclude header - + return current_row_count - self.session_start_row - + def _save_marker(self, start_row: int): """ Save session start marker to file (private method). 
- + Args: start_row: The starting row number """ # Ensure parent directory exists self.marker_file.parent.mkdir(parents=True, exist_ok=True) - with open(self.marker_file, 'w') as f: + with open(self.marker_file, "w") as f: f.write(str(start_row)) - + def cleanup_marker(self): """ Remove session marker file for this instance. @@ -103,28 +103,27 @@ def cleanup_marker(self): os.remove(self.marker_file) except OSError: pass - + @staticmethod def load_session_marker_for_csv(perf_csv_path: str = "perf.csv") -> Optional[int]: """ Static helper to load session marker for a given CSV path. - + This is useful when you don't have a SessionTracker instance but need to load the marker. - + Args: perf_csv_path: Path to the performance CSV file - + Returns: Session start row, or None if marker doesn't exist """ perf_path = Path(perf_csv_path).resolve() marker_file = perf_path.parent / ".madengine_session_start" - + if marker_file.exists(): try: - with open(marker_file, 'r') as f: + with open(marker_file, "r") as f: return int(f.read().strip()) except (ValueError, IOError): return None return None - diff --git a/tests/conftest.py b/tests/conftest.py index 91241e01..f7c0d937 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,13 +17,14 @@ if _SRC.is_dir() and str(_SRC) not in sys.path: sys.path.insert(0, str(_SRC)) from unittest.mock import MagicMock, patch -import pytest +import pytest # ============================================================================ # Platform Configuration Fixtures # ============================================================================ + @pytest.fixture def amd_gpu_context(): """Mock Context for AMD GPU platform (ROCm).""" @@ -95,6 +96,7 @@ def multi_platform_context(request, amd_gpu_context, nvidia_gpu_context, cpu_con # Mock Args Fixtures # ============================================================================ + @pytest.fixture def mock_build_args(): """Mock args for build command.""" @@ -149,6 +151,7 @@ def mock_run_args(): 
# Test Data Fixtures # ============================================================================ + @pytest.fixture def sample_models(): """Sample model data for testing.""" @@ -273,17 +276,16 @@ def sample_manifest(): # Temporary File Fixtures # ============================================================================ + @pytest.fixture def temp_manifest_file(sample_manifest): """Create a temporary manifest file.""" - with tempfile.NamedTemporaryFile( - mode="w", suffix=".json", delete=False - ) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: json.dump(sample_manifest, f) manifest_path = f.name - + yield manifest_path - + # Cleanup if os.path.exists(manifest_path): os.unlink(manifest_path) @@ -295,9 +297,9 @@ def temp_working_dir(): with tempfile.TemporaryDirectory() as tmpdir: original_cwd = os.getcwd() os.chdir(tmpdir) - + yield tmpdir - + os.chdir(original_cwd) @@ -305,6 +307,7 @@ def temp_working_dir(): # Mock Builder and Runner Fixtures # ============================================================================ + @pytest.fixture def mock_docker_builder(sample_build_summary_success): """Mock DockerBuilder with successful builds.""" @@ -349,6 +352,7 @@ def mock_container_runner(): # Integration Test Helpers # ============================================================================ + @pytest.fixture def integration_test_env(): """Setup integration test environment variables.""" @@ -356,7 +360,7 @@ def integration_test_env(): "MODEL_DIR": "tests/fixtures/dummy", "MAD_SKIP_GPU_CHECK": "1", # Skip actual GPU detection in tests } - + with patch.dict(os.environ, env_vars, clear=False): yield env_vars @@ -365,26 +369,17 @@ def integration_test_env(): # Pytest Configuration # ============================================================================ + def pytest_configure(config): """Configure pytest with custom markers.""" config.addinivalue_line( "markers", "integration: marks tests as integration tests (may be 
slow)" ) - config.addinivalue_line( - "markers", "unit: marks tests as fast unit tests" - ) - config.addinivalue_line( - "markers", "gpu: marks tests that require GPU hardware" - ) - config.addinivalue_line( - "markers", "amd: marks tests specific to AMD GPUs" - ) - config.addinivalue_line( - "markers", "nvidia: marks tests specific to NVIDIA GPUs" - ) - config.addinivalue_line( - "markers", "cpu: marks tests for CPU-only execution" - ) + config.addinivalue_line("markers", "unit: marks tests as fast unit tests") + config.addinivalue_line("markers", "gpu: marks tests that require GPU hardware") + config.addinivalue_line("markers", "amd: marks tests specific to AMD GPUs") + config.addinivalue_line("markers", "nvidia: marks tests specific to NVIDIA GPUs") + config.addinivalue_line("markers", "cpu: marks tests for CPU-only execution") config.addinivalue_line( "markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')" ) @@ -394,40 +389,42 @@ def pytest_configure(config): # Utility Functions for Tests # ============================================================================ + def assert_build_manifest_valid(manifest_path): """Assert that a build manifest file is valid.""" assert os.path.exists(manifest_path), f"Manifest not found: {manifest_path}" - + with open(manifest_path) as f: manifest = json.load(f) - + # Check required keys assert "built_images" in manifest assert "built_models" in manifest assert "summary" in manifest - + # Check summary structure summary = manifest["summary"] assert "successful_builds" in summary assert "failed_builds" in summary assert isinstance(summary["successful_builds"], list) assert isinstance(summary["failed_builds"], list) - + return manifest def assert_perf_csv_valid(csv_path): """Assert that a performance CSV file is valid.""" assert os.path.exists(csv_path), f"Performance CSV not found: {csv_path}" - + import pandas as pd + df = pd.read_csv(csv_path) - + # Check required columns required_columns = ["model", 
"n_gpus", "gpu_architecture", "status"] for col in required_columns: assert col in df.columns, f"Missing column: {col}" - + return df @@ -436,4 +433,3 @@ def assert_perf_csv_valid(csv_path): "assert_build_manifest_valid", "assert_perf_csv_valid", ] - diff --git a/tests/e2e/test_build_workflows.py b/tests/e2e/test_build_workflows.py index 9cc74438..179762f7 100644 --- a/tests/e2e/test_build_workflows.py +++ b/tests/e2e/test_build_workflows.py @@ -10,25 +10,30 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ -# built-in modules -import os import csv import json -import pandas as pd -# 3rd party modules -import pytest +# built-in modules +import os # project modules import shutil -from tests.fixtures.utils import BASE_DIR, MODEL_DIR -from tests.fixtures.utils import global_data -from tests.fixtures.utils import clean_test_temp_files -from tests.fixtures.utils import DEFAULT_CLEAN_FILES -from tests.fixtures.utils import generate_additional_context_for_machine -from tests.fixtures.utils import get_gpu_arch -from tests.fixtures.utils import requires_gpu +import pandas as pd + +# 3rd party modules +import pytest + +from tests.fixtures.utils import ( + BASE_DIR, + DEFAULT_CLEAN_FILES, + MODEL_DIR, + clean_test_temp_files, + generate_additional_context_for_machine, + get_gpu_arch, + global_data, + requires_gpu, +) @pytest.fixture @@ -58,11 +63,11 @@ def dynamic_skip_gpu_arch_model_dir(tmp_path): return str(temp_model_dir) - # ============================================================================ # Build CLI Features Tests # ============================================================================ + class TestCLIFeatures: """Test various CLI features and command-line argument behaviors.""" @@ -99,7 +104,9 @@ def test_output_commandline_argument_writes_csv_correctly( if not success: pytest.fail("model, dummy, not found in perf_test.csv.") - @requires_gpu("skip_gpu_arch filtering requires GPU hardware to detect current architecture") + 
@requires_gpu( + "skip_gpu_arch filtering requires GPU hardware to detect current architecture" + ) @pytest.mark.parametrize( "clean_test_temp_files", [["perf_test.csv", "perf_test.html"]], indirect=True ) @@ -124,7 +131,9 @@ def test_commandline_argument_skip_gpu_arch( if "Skipping model" not in output: pytest.fail("Enable skipping gpu arch for running model is failed.") - @requires_gpu("skip_gpu_arch filtering requires GPU hardware to detect current architecture") + @requires_gpu( + "skip_gpu_arch filtering requires GPU hardware to detect current architecture" + ) @pytest.mark.parametrize( "clean_test_temp_files", [["perf_test.csv", "perf_test.html"]], indirect=True ) @@ -158,41 +167,46 @@ def test_output_multi_results(self, global_data, clean_test_temp_files): UPDATED: Now uses python3 -m madengine.cli.app instead of legacy mad.py """ context = generate_additional_context_for_machine() - output = global_data['console'].sh( - "cd " + BASE_DIR + "; " + - "MODEL_DIR=" + MODEL_DIR + " " + - f"python3 -m madengine.cli.app run --tags dummy_multi --live-output --additional-context '{json.dumps(context)}'" + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --tags dummy_multi --live-output --additional-context '{json.dumps(context)}'" ) # Check if multiple results are written to perf_dummy.csv success = False # Read the csv file to a dataframe using pandas - multi_df = pd.read_csv(os.path.join(BASE_DIR, 'perf_dummy.csv')) + multi_df = pd.read_csv(os.path.join(BASE_DIR, "perf_dummy.csv")) # Check the number of rows in the dataframe is 4, and columns is 4 if multi_df.shape == (4, 4): success = True if not success: pytest.fail("The generated multi results is not correct.") # Check if multiple results from perf_dummy.csv get copied over to perf.csv - perf_df = pd.read_csv(os.path.join(BASE_DIR, 'perf.csv')) + perf_df = pd.read_csv(os.path.join(BASE_DIR, "perf.csv")) # Get the 
corresponding rows and columns from perf.csv perf_df = perf_df[multi_df.columns] perf_df = perf_df.iloc[-4:, :] # Drop model columns from both dataframes; these will not match # if multiple results csv has {model}, then perf csv has {tag_name}_{model} - multi_df = multi_df.drop('model', axis=1) - perf_df = perf_df.drop('model', axis=1) + multi_df = multi_df.drop("model", axis=1) + perf_df = perf_df.drop("model", axis=1) if all(perf_df.columns == multi_df.columns): success = True if not success: - pytest.fail("The columns of the generated multi results do not match perf.csv.") - - + pytest.fail( + "The columns of the generated multi results do not match perf.csv." + ) # ============================================================================ # Model Discovery Tests # ============================================================================ + class TestDiscover: """Test the model discovery feature.""" @@ -316,5 +330,3 @@ def test_multiple(self, global_data, clean_test_temp_files): success = True if not success: pytest.fail("multiple tags did not run successfully.") - - diff --git a/tests/e2e/test_data_workflows.py b/tests/e2e/test_data_workflows.py index b83232d5..e1f3f7e2 100644 --- a/tests/e2e/test_data_workflows.py +++ b/tests/e2e/test_data_workflows.py @@ -3,23 +3,28 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" +import csv +import json + # built-in modules import os -import csv import re -import json import tempfile # third-party modules import pytest -# project modules -from tests.fixtures.utils import BASE_DIR, MODEL_DIR -from tests.fixtures.utils import global_data -from tests.fixtures.utils import clean_test_temp_files -from tests.fixtures.utils import DEFAULT_CLEAN_FILES from madengine.core.dataprovider import Data +# project modules +from tests.fixtures.utils import ( + BASE_DIR, + DEFAULT_CLEAN_FILES, + MODEL_DIR, + clean_test_temp_files, + global_data, +) + class TestDataProviders: diff --git a/tests/e2e/test_execution_features.py b/tests/e2e/test_execution_features.py index 4d7fd601..1399637d 100644 --- a/tests/e2e/test_execution_features.py +++ b/tests/e2e/test_execution_features.py @@ -22,11 +22,11 @@ ) - # ============================================================================ # Timeout Feature Tests # ============================================================================ + class TestCustomTimeoutsFunctionality: @pytest.mark.parametrize( @@ -42,7 +42,13 @@ class TestCustomTimeoutsFunctionality: ], ) def test_timeout_value_in_log( - self, global_data, clean_test_temp_files, tags, log_base_name, expected_seconds, extra_args + self, + global_data, + clean_test_temp_files, + tags, + log_base_name, + expected_seconds, + extra_args, ): """ Timeout is set as expected (default 2h, model override, CLI override). @@ -113,11 +119,11 @@ def test_timeout_in_model_timesout_correctly( assert test_duration == pytest.approx(120, 20) - # ============================================================================ # Debugging Feature Tests # ============================================================================ + class TestDebuggingFunctionality: """""" @@ -284,10 +290,12 @@ def test_no_keepModelDir_does_not_keep_model_dir( "model directory left over after not specifying keep-model-dir (or keep-alive) argument." 
) + # ============================================================================ # Live Output Feature Tests # ============================================================================ + class TestLiveOutputFunctionality: """Test the live output functionality.""" @@ -344,5 +352,3 @@ def test_liveOutput_prints_output_to_screen( if "ARG BASE_DOCKER=" not in output: pytest.fail("default run is silent") - - diff --git a/tests/e2e/test_profiling_workflows.py b/tests/e2e/test_profiling_workflows.py index 74925ae2..3f06af91 100644 --- a/tests/e2e/test_profiling_workflows.py +++ b/tests/e2e/test_profiling_workflows.py @@ -15,13 +15,13 @@ # project modules from tests.fixtures.utils import ( BASE_DIR, - MODEL_DIR, DEFAULT_CLEAN_FILES, - global_data, + MODEL_DIR, clean_test_temp_files, - requires_gpu, - is_nvidia, generate_additional_context_for_machine, + global_data, + is_nvidia, + requires_gpu, ) @@ -47,14 +47,14 @@ def test_rocprof_profiling_tool_runs_correctly( + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rocprof\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "rocprof"}]}\' ', canFail=True, ) # Check for both legacy rocprof (results.csv) and rocprofv3 (.db files) output rocprof_output_dir = os.path.join(BASE_DIR, "rocprof_output") legacy_output = os.path.join(rocprof_output_dir, "results.csv") - + # Check for rocprofv3 .db files in subdirectories rocprofv3_output_found = False if os.path.exists(rocprof_output_dir): @@ -65,7 +65,7 @@ def test_rocprof_profiling_tool_runs_correctly( break if rocprofv3_output_found: break - + if not os.path.exists(legacy_output) and not rocprofv3_output_found: pytest.fail( "Neither rocprof_output/results.csv (legacy) nor *_results.db (rocprofv3) generated with 
rocprof profiling run." @@ -92,7 +92,7 @@ def test_rocm_trace_lite_profiling_tool_runs_correctly( + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rocm_trace_lite\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "rocm_trace_lite"}]}\' ', canFail=True, ) @@ -122,7 +122,7 @@ def test_rpd_profiling_tool_runs_correctly( + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rpd\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "rpd"}]}\' ', canFail=True, ) @@ -148,7 +148,7 @@ def test_gpu_info_power_profiling_tool_runs_correctly( + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"gpu_info_power_profiler\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "gpu_info_power_profiler"}]}\' ', canFail=False, ) @@ -178,7 +178,7 @@ def test_gpu_info_vram_profiling_tool_runs_correctly( + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"gpu_info_vram_profiler\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "gpu_info_vram_profiler"}]}\' 
', canFail=False, ) @@ -206,7 +206,7 @@ def test_rocblas_trace_runs_correctly(self, global_data, clean_test_temp_files): + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rocblas_trace\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "rocblas_trace"}]}\' ', canFail=False, ) @@ -242,7 +242,7 @@ def test_tensile_trace_runs_correctly(self, global_data, clean_test_temp_files): + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"tensile_trace\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "tensile_trace"}]}\' ', canFail=True, # Allow failure due to missing performance metrics (trace tools suppress performance output) ) @@ -278,7 +278,7 @@ def test_miopen_trace_runs_correctly(self, global_data, clean_test_temp_files): + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"miopen_trace\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "miopen_trace"}]}\' ', canFail=False, ) @@ -312,7 +312,7 @@ def test_rccl_trace_runs_correctly(self, global_data, clean_test_temp_files): + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof_rccl --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rccl_trace\"}]}' 
", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof_rccl --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "rccl_trace"}]}\' ', canFail=False, ) @@ -353,7 +353,7 @@ def test_toolA_runs_correctly(self, global_data, clean_test_temp_files): + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"test_tools_A\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "test_tools_A"}]}\' ', canFail=False, ) @@ -399,7 +399,7 @@ def test_stackable_design_runs_correctly(self, global_data, clean_test_temp_file + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"test_tools_A\"}, {\"name\": \"test_tools_B\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "test_tools_A"}, {"name": "test_tools_B"}]}\' ', canFail=False, ) @@ -462,13 +462,13 @@ def test_can_change_default_behavior_of_profiling_tool_with_additionalContext( + "MODEL_DIR=" + MODEL_DIR + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rocprof\", \"cmd\": \"bash ../scripts/common/tools/rocprof_wrapper.sh --sys-trace --\"}]}' ", + + 'python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context \'{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": [{"name": "rocprof", "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --sys-trace --"}]}\' ', canFail=True, ) # Check for profiling output (either legacy or rocprofv3 format) 
rocprof_output_dir = os.path.join(BASE_DIR, "rocprof_output") - + # For rocprofv3 with --sys-trace, check for .db files rocprofv3_output_found = False if os.path.exists(rocprof_output_dir): @@ -479,10 +479,12 @@ def test_can_change_default_behavior_of_profiling_tool_with_additionalContext( break if rocprofv3_output_found: break - + # Legacy check for results files - legacy_output = os.path.exists(os.path.join(BASE_DIR, "rocprof_output", "results.csv")) - + legacy_output = os.path.exists( + os.path.join(BASE_DIR, "rocprof_output", "results.csv") + ) + if not legacy_output and not rocprofv3_output_found: pytest.fail( "No profiling output generated with custom rocprof command override." diff --git a/tests/e2e/test_run_workflows.py b/tests/e2e/test_run_workflows.py index 32f6141b..a5e09112 100644 --- a/tests/e2e/test_run_workflows.py +++ b/tests/e2e/test_run_workflows.py @@ -3,36 +3,38 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ +import csv +import json + # built-in modules import os -import csv # third-party modules import pytest -import json + +from madengine.core.context import Context # project modules -from tests.fixtures.utils import BASE_DIR, MODEL_DIR -from tests.fixtures.utils import global_data -from tests.fixtures.utils import clean_test_temp_files -from tests.fixtures.utils import get_gpu_nodeid_map -from tests.fixtures.utils import get_num_gpus -from tests.fixtures.utils import get_num_cpus -from tests.fixtures.utils import requires_gpu -from tests.fixtures.utils import generate_additional_context_for_machine from tests.fixtures.utils import ( + BASE_DIR, DEFAULT_CLEAN_FILES, - build_run_command, + MODEL_DIR, assert_model_in_perf_csv, + build_run_command, + clean_test_temp_files, + generate_additional_context_for_machine, + get_gpu_nodeid_map, + get_num_cpus, + get_num_gpus, + global_data, + requires_gpu, ) -from madengine.core.context import Context - - # 
============================================================================ # Context Handling Tests # ============================================================================ + class TestContexts: @pytest.mark.parametrize( @@ -331,9 +333,7 @@ def test_docker_mounts_mount_host_paths_in_docker_container( ) @requires_gpu("docker gpus requires GPU hardware") - @pytest.mark.skipif( - get_num_gpus() < 8, reason="test requires atleast 8 gpus" - ) + @pytest.mark.skipif(get_num_gpus() < 8, reason="test requires atleast 8 gpus") @pytest.mark.parametrize( "clean_test_temp_files", [["perf.csv", "perf.html", "results_dummy_gpubind.csv"]], @@ -364,24 +364,24 @@ def test_docker_gpus(self, global_data, clean_test_temp_files): gpu_node_ids.append(row["performance"]) else: pytest.fail("model in perf_test.csv did not run successfully.") - + # Debug information print(f"GPU node IDs from performance: {gpu_node_ids}") print(f"GPU nodeid map: {gpu_nodeid_map}") mapped_gpus = [gpu_nodeid_map.get(node_id) for node_id in gpu_node_ids] print(f"Mapped GPUs: {mapped_gpus}") - + # Filter out None values and sort valid_mapped_gpus = [gpu for gpu in mapped_gpus if gpu is not None] sorted_gpus = sorted(valid_mapped_gpus) print(f"Sorted valid GPUs: {sorted_gpus}") - + if sorted_gpus != [0, 2, 3, 4, 5, 7]: - pytest.fail(f"docker_gpus did not bind expected gpus in docker container. Expected: [0, 2, 3, 4, 5, 7], Got: {sorted_gpus}, Raw node IDs: {gpu_node_ids}, Mapping: {gpu_nodeid_map}") + pytest.fail( + f"docker_gpus did not bind expected gpus in docker container. 
Expected: [0, 2, 3, 4, 5, 7], Got: {sorted_gpus}, Raw node IDs: {gpu_node_ids}, Mapping: {gpu_nodeid_map}" + ) - @pytest.mark.skipif( - get_num_cpus() < 64, reason="test requires atleast 64 cpus" - ) + @pytest.mark.skipif(get_num_cpus() < 64, reason="test requires atleast 64 cpus") @pytest.mark.parametrize( "clean_test_temp_files", [["perf.csv", "perf.html", "results_dummy_cpubind.csv"]], @@ -425,25 +425,25 @@ def test_gpu_product_name_matches_arch(self): """ context = Context() - product_name = context.ctx['docker_env_vars']["MAD_SYSTEM_GPU_PRODUCT_NAME"] + product_name = context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_PRODUCT_NAME"] - #fail the test if GPU product name is empty + # fail the test if GPU product name is empty if not product_name or not product_name.strip(): pytest.fail("GPU product name is empty or just whitespaces") product_name = product_name.upper() - #if product name has AMD or NVIDIA in it then it's a safe bet - #that it was parsed properly + # if product name has AMD or NVIDIA in it then it's a safe bet + # that it was parsed properly if not ("AMD" in product_name or "NVIDIA" in product_name): pytest.fail(f"Incorrect product name={product_name!r}") - # ============================================================================ # Tag Filtering Tests # ============================================================================ + class TestTagsFunctionality: @pytest.mark.parametrize( @@ -463,7 +463,7 @@ def test_can_select_model_subset_with_commandline_tag_argument( + "MODEL_DIR=" + MODEL_DIR + " " - + f"python3 -m madengine.cli.app run --tags dummy_group_1 --live-output --additional-context '{json.dumps(context)}'" + + f"python3 -m madengine.cli.app run --tags dummy_group_1 --live-output --additional-context '{json.dumps(context)}'" ) # Check for model execution (handles ANSI codes in output) @@ -520,11 +520,9 @@ def test_model_names_are_automatically_tags( + "MODEL_DIR=" + MODEL_DIR + " " - + f"python3 -m madengine.cli.app run --tags dummy 
--live-output --additional-context '{json.dumps(context)}'" + + f"python3 -m madengine.cli.app run --tags dummy --live-output --additional-context '{json.dumps(context)}'" ) # Check for model execution (handles ANSI codes in output) if "dummy" not in output or "ci-dummy_dummy" not in output: pytest.fail("dummy tag not selected with commandline --tags argument") - - diff --git a/tests/e2e/test_scripting_workflows.py b/tests/e2e/test_scripting_workflows.py index 3c163f1a..f1c45e3d 100644 --- a/tests/e2e/test_scripting_workflows.py +++ b/tests/e2e/test_scripting_workflows.py @@ -11,11 +11,14 @@ import pytest # project modules -from tests.fixtures.utils import BASE_DIR, MODEL_DIR -from tests.fixtures.utils import global_data -from tests.fixtures.utils import clean_test_temp_files -from tests.fixtures.utils import DEFAULT_CLEAN_FILES -from tests.fixtures.utils import is_nvidia +from tests.fixtures.utils import ( + BASE_DIR, + DEFAULT_CLEAN_FILES, + MODEL_DIR, + clean_test_temp_files, + global_data, + is_nvidia, +) class TestPrePostScriptsFunctionality: diff --git a/tests/fixtures/configs/test_slurm_job.yaml b/tests/fixtures/configs/test_slurm_job.yaml new file mode 100644 index 00000000..97342a7f --- /dev/null +++ b/tests/fixtures/configs/test_slurm_job.yaml @@ -0,0 +1,17 @@ +model: + tags: [dummy] + +slurm: + partition: test-partition + nodes: 2 + +distributed: + enabled: true + launcher: torchrun + nnodes: 2 + nproc_per_node: 4 + +env_vars: + MY_VAR: test_value + +debug: true diff --git a/tests/fixtures/dummy/data.json b/tests/fixtures/dummy/data.json index 2c76f3df..5ca41a83 100644 --- a/tests/fixtures/dummy/data.json +++ b/tests/fixtures/dummy/data.json @@ -9,4 +9,4 @@ "path": "/tmp/nonexistent" } } -} \ No newline at end of file +} diff --git a/tests/fixtures/dummy/docker/dummy_sglang.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_sglang.ubuntu.amd.Dockerfile index f45e5bc3..33086f8e 100644 --- 
a/tests/fixtures/dummy/docker/dummy_sglang.ubuntu.amd.Dockerfile +++ b/tests/fixtures/dummy/docker/dummy_sglang.ubuntu.amd.Dockerfile @@ -130,4 +130,3 @@ RUN echo "====================================================================== echo "" && \ echo "🚀 Ready for distributed LLM inference on AMD GPUs!" && \ echo "" - diff --git a/tests/fixtures/dummy/docker/dummy_sglang_disagg.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_sglang_disagg.ubuntu.amd.Dockerfile index 43d04337..67e4f5cb 100644 --- a/tests/fixtures/dummy/docker/dummy_sglang_disagg.ubuntu.amd.Dockerfile +++ b/tests/fixtures/dummy/docker/dummy_sglang_disagg.ubuntu.amd.Dockerfile @@ -183,4 +183,3 @@ RUN echo "====================================================================== echo " Note: This is a dummy/test image for madengine validation" && \ echo " For production: Use full Mooncake with RDMA support" && \ echo "" - diff --git a/tests/fixtures/dummy/docker/dummy_therock.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_therock.ubuntu.amd.Dockerfile index 16dda670..47fb3644 100644 --- a/tests/fixtures/dummy/docker/dummy_therock.ubuntu.amd.Dockerfile +++ b/tests/fixtures/dummy/docker/dummy_therock.ubuntu.amd.Dockerfile @@ -123,4 +123,3 @@ LABEL description="TheRock PyTorch Benchmark - The HIP Environment and ROCm Kit LABEL version="nightly" LABEL gpu_architecture="${MAD_SYSTEM_GPU_ARCHITECTURE}" LABEL components="core_runtime,hip_runtime,blas,prim,rand,pytorch" - diff --git a/tests/fixtures/dummy/docker/dummy_torchrun.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_torchrun.ubuntu.amd.Dockerfile index e195b386..8aaccc86 100644 --- a/tests/fixtures/dummy/docker/dummy_torchrun.ubuntu.amd.Dockerfile +++ b/tests/fixtures/dummy/docker/dummy_torchrun.ubuntu.amd.Dockerfile @@ -43,4 +43,3 @@ RUN rocminfo > /dev/null 2>&1 || echo "ROCm info check failed (expected in non-G # Note: The K8s deployment config should override these env vars if needed: # - MIOPEN_FIND_MODE is already 
set in deployment_config.env_vars # - MIOPEN_USER_DB_PATH is already set in deployment_config.env_vars - diff --git a/tests/fixtures/dummy/docker/dummy_torchtitan.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_torchtitan.ubuntu.amd.Dockerfile index 48deaa06..1d333905 100644 --- a/tests/fixtures/dummy/docker/dummy_torchtitan.ubuntu.amd.Dockerfile +++ b/tests/fixtures/dummy/docker/dummy_torchtitan.ubuntu.amd.Dockerfile @@ -69,4 +69,3 @@ RUN python3 -c "import torch; print(f'✓ PyTorch version: {torch.__version__}') rocminfo > /dev/null 2>&1 || echo "ROCm check (OK in build env)" WORKDIR /workspace - diff --git a/tests/fixtures/dummy/docker/dummy_vllm.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_vllm.ubuntu.amd.Dockerfile index cf180092..480dfc12 100644 --- a/tests/fixtures/dummy/docker/dummy_vllm.ubuntu.amd.Dockerfile +++ b/tests/fixtures/dummy/docker/dummy_vllm.ubuntu.amd.Dockerfile @@ -93,4 +93,3 @@ RUN echo "=======================================" && \ echo "PyTorch Version: $(python3 -c 'import torch; print(torch.__version__)')" && \ echo "Build Type: Production (Real vLLM with ROCm)" && \ echo "=======================================" - diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index 140779ab..48942f26 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -255,7 +255,7 @@ "name": "dummy_torchrun", "dockerfile": "docker/dummy_torchrun", "scripts": "scripts/dummy_torchrun/run.sh", - "n_gpus": "1", + "n_gpus": "4", "owner": "mad.support@amd.com", "training_precision": "", "tags": [ @@ -268,7 +268,7 @@ "name": "dummy_torchrun_multi", "dockerfile": "docker/dummy_torchrun", "scripts": "scripts/dummy_torchrun/run_multi.sh", - "n_gpus": "1", + "n_gpus": "4", "owner": "mad.support@amd.com", "training_precision": "", "tags": [ diff --git a/tests/fixtures/dummy/scripts/dummy/run.sh b/tests/fixtures/dummy/scripts/dummy/run.sh index e5db9e7b..2c9893f7 100644 --- 
a/tests/fixtures/dummy/scripts/dummy/run.sh +++ b/tests/fixtures/dummy/scripts/dummy/run.sh @@ -1,7 +1,7 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# echo "performance: $RANDOM samples_per_second" diff --git a/tests/fixtures/dummy/scripts/dummy/run_cpu_bind.sh b/tests/fixtures/dummy/scripts/dummy/run_cpu_bind.sh index 1206d310..f4e57df9 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_cpu_bind.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_cpu_bind.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# cpus="" if [ -f "/sys/fs/cgroup/cpuset/cpuset.cpus" ]; then diff --git a/tests/fixtures/dummy/scripts/dummy/run_ctxtest.sh b/tests/fixtures/dummy/scripts/dummy/run_ctxtest.sh index c69d8d1b..d6a43a86 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_ctxtest.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_ctxtest.sh @@ -1,7 +1,7 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# echo "performance: ${ctxtest} context" diff --git a/tests/fixtures/dummy/scripts/dummy/run_data_aws.sh b/tests/fixtures/dummy/scripts/dummy/run_data_aws.sh index ab0a8641..6a66f296 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_data_aws.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_data_aws.sh @@ -6,5 +6,3 @@ else echo "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx is NOT present" exit 1 fi - - diff --git a/tests/fixtures/dummy/scripts/dummy/run_data_local.sh b/tests/fixtures/dummy/scripts/dummy/run_data_local.sh index 4c5efd5e..1d0f1286 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_data_local.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_data_local.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. 
-# +# mountCode=`mount | grep "${MAD_DATAHOME} "` if [ -z "$mountCode" ]; then @@ -12,5 +12,3 @@ else echo "${MAD_DATAHOME} is mounted" echo "performance: $RANDOM samples_per_second" fi - - diff --git a/tests/fixtures/dummy/scripts/dummy/run_data_minio.sh b/tests/fixtures/dummy/scripts/dummy/run_data_minio.sh index ce697b39..d0e4caa0 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_data_minio.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_data_minio.sh @@ -4,4 +4,4 @@ if [ -f "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0. else echo "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx is NOT present" exit 1 -fi \ No newline at end of file +fi diff --git a/tests/fixtures/dummy/scripts/dummy/run_data_nas.sh b/tests/fixtures/dummy/scripts/dummy/run_data_nas.sh index 878d9330..e464ffd3 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_data_nas.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_data_nas.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# if [ -z ${MAD_DATAHOME+x} ]; then echo "MAD_DATAHOME is NOT set" @@ -34,5 +34,3 @@ else echo "${MAD_DATAHOME} is mounted" echo "performance: $RANDOM samples_per_second" fi - - diff --git a/tests/fixtures/dummy/scripts/dummy/run_gpu_bind.sh b/tests/fixtures/dummy/scripts/dummy/run_gpu_bind.sh index db82af02..d27f651a 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_gpu_bind.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_gpu_bind.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# node_ids=() diff --git a/tests/fixtures/dummy/scripts/dummy/run_multi.sh b/tests/fixtures/dummy/scripts/dummy/run_multi.sh index 4bc59b36..ed10f5cc 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_multi.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_multi.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. 
# All rights reserved. -# +# echo "model,temperature,performance,metric 1,$RANDOM,$RANDOM,samples_per_sec diff --git a/tests/fixtures/dummy/scripts/dummy/run_nccl_trace.sh b/tests/fixtures/dummy/scripts/dummy/run_nccl_trace.sh index da2a8798..5eaa5f5e 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_nccl_trace.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_nccl_trace.sh @@ -1,9 +1,9 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# python -c "import torch; import torch.distributed as dist; import os; os.environ['MASTER_ADDR'] = 'localhost'; os.environ['MASTER_PORT'] = '29501'; dist.init_process_group('nccl', rank=0, world_size=1);tensor = torch.arange(1, dtype=torch.int64).cuda(); dist.all_reduce(tensor, op=dist.ReduceOp.SUM); print(tensor[0]); " | tee log.txt - -echo "performance: 1 pass" + +echo "performance: 1 pass" diff --git a/tests/fixtures/dummy/scripts/dummy/run_prof.sh b/tests/fixtures/dummy/scripts/dummy/run_prof.sh index 85a5a05f..b614e679 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_prof.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_prof.sh @@ -1,10 +1,10 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. 
-# +# python -c "import torch; x = torch.ones(10,10).to('cuda'); l = torch.nn.Linear(10,30).cuda(); c = torch.nn.Conv2d(1, 20, 3).cuda(); out1 = l(x); out1 = out1[None, None, :, :] ; out2 = c(out1); print( 'performance=' + str(torch.cuda.memory_allocated(0)) )" | tee log.txt - + performance=$(grep -o "performance=[0-9]*" log.txt | tail -n 1 | sed 's/performance=//') -echo "performance: $performance bytes" +echo "performance: $performance bytes" diff --git a/tests/fixtures/dummy/scripts/dummy/run_sleep.sh b/tests/fixtures/dummy/scripts/dummy/run_sleep.sh index 22b1c179..ec4f6c6f 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_sleep.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_sleep.sh @@ -1,8 +1,8 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# sleep $1 echo "performance: $RANDOM samples_per_second" diff --git a/tests/fixtures/dummy/scripts/dummy2/models.json b/tests/fixtures/dummy/scripts/dummy2/models.json index de114986..75ae208b 100644 --- a/tests/fixtures/dummy/scripts/dummy2/models.json +++ b/tests/fixtures/dummy/scripts/dummy2/models.json @@ -25,4 +25,4 @@ ], "args": "" } -] \ No newline at end of file +] diff --git a/tests/fixtures/dummy/scripts/dummy2/run.sh b/tests/fixtures/dummy/scripts/dummy2/run.sh index e5db9e7b..2c9893f7 100644 --- a/tests/fixtures/dummy/scripts/dummy2/run.sh +++ b/tests/fixtures/dummy/scripts/dummy2/run.sh @@ -1,7 +1,7 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. -# +# echo "performance: $RANDOM samples_per_second" diff --git a/tests/fixtures/dummy/scripts/dummy3/get_models_json.py b/tests/fixtures/dummy/scripts/dummy3/get_models_json.py index 425a0b19..6c0c857d 100644 --- a/tests/fixtures/dummy/scripts/dummy3/get_models_json.py +++ b/tests/fixtures/dummy/scripts/dummy3/get_models_json.py @@ -4,6 +4,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" + from madengine.utils.discover_models import CustomModel Model3Data = CustomModel( @@ -18,20 +19,20 @@ multiple_results="", ) + class Dummy3CustomModel(CustomModel): def update_model(self): - self.dockerfile="docker/dummy" - self.scripts="run.sh" + self.dockerfile = "docker/dummy" + self.scripts = "run.sh" self.n_gpus = "-1" self.owner = "mad.support@amd.com" self.training_precision = "" self.args = "" self.multiple_results = "" -Model4Data = Dummy3CustomModel( - name="model4", - tags = ["dummies", "dummy_test_group_3"] -) + +Model4Data = Dummy3CustomModel(name="model4", tags=["dummies", "dummy_test_group_3"]) + def list_models(): - return [Model3Data, Model4Data] \ No newline at end of file + return [Model3Data, Model4Data] diff --git a/tests/fixtures/dummy/scripts/dummy3/run.sh b/tests/fixtures/dummy/scripts/dummy3/run.sh index e5db9e7b..2c9893f7 100644 --- a/tests/fixtures/dummy/scripts/dummy3/run.sh +++ b/tests/fixtures/dummy/scripts/dummy3/run.sh @@ -1,7 +1,7 @@ #!/bin/bash -# +# # Copyright (c) Advanced Micro Devices, Inc. # All rights reserved. 
-# +# echo "performance: $RANDOM samples_per_second" diff --git a/tests/fixtures/dummy/scripts/dummy_deepspeed/run.sh b/tests/fixtures/dummy/scripts/dummy_deepspeed/run.sh index aa86bc85..8d3ae199 100644 --- a/tests/fixtures/dummy/scripts/dummy_deepspeed/run.sh +++ b/tests/fixtures/dummy/scripts/dummy_deepspeed/run.sh @@ -29,4 +29,3 @@ $LAUNCHER_CMD run_deepspeed.py --deepspeed_config ds_config.json echo "========================================================================" echo "Training script completed" echo "========================================================================" - diff --git a/tests/fixtures/dummy/scripts/dummy_deepspeed/run_deepspeed.py b/tests/fixtures/dummy/scripts/dummy_deepspeed/run_deepspeed.py index 7851597f..1d0bef66 100755 --- a/tests/fixtures/dummy/scripts/dummy_deepspeed/run_deepspeed.py +++ b/tests/fixtures/dummy/scripts/dummy_deepspeed/run_deepspeed.py @@ -12,15 +12,16 @@ deepspeed --num_gpus=2 run_deepspeed.py """ +import argparse import os +import socket import sys import time -import socket -import argparse + +import deepspeed import torch -import torch.nn as nn import torch.distributed as dist -import deepspeed +import torch.nn as nn # Configuration NUM_EPOCHS = 3 @@ -28,8 +29,10 @@ IMAGE_SIZE = 224 NUM_CLASSES = 1000 + class SimpleModel(nn.Module): """Simple model for DeepSpeed testing""" + def __init__(self, num_classes=1000): super().__init__() self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) @@ -41,7 +44,7 @@ def __init__(self, num_classes=1000): self.bn3 = nn.BatchNorm2d(256) self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(256, num_classes) - + def forward(self, x): x = self.pool(torch.relu(self.bn1(self.conv1(x)))) x = self.pool(torch.relu(self.bn2(self.conv2(x)))) @@ -50,11 +53,12 @@ def forward(self, x): x = torch.flatten(x, 1) return self.fc(x) + def print_header(args): rank = int(os.environ.get("RANK", 0)) local_rank = int(os.environ.get("LOCAL_RANK", 0)) world_size = 
int(os.environ.get("WORLD_SIZE", 1)) - + if rank == 0: print("=" * 70) print("DeepSpeed Distributed Training Benchmark") @@ -65,71 +69,75 @@ def print_header(args): print(f"Training: {NUM_EPOCHS} epochs, {NUM_BATCHES} batches/epoch") print("=" * 70) + def train_epoch(model_engine, criterion, epoch): model_engine.train() start_time = time.time() total_loss = 0 - + local_rank = model_engine.local_rank micro_batch_size = model_engine.train_micro_batch_size_per_gpu() - + for batch_idx in range(NUM_BATCHES): # Synthetic data inputs = torch.randn( - micro_batch_size, 3, IMAGE_SIZE, IMAGE_SIZE, - device=model_engine.device + micro_batch_size, 3, IMAGE_SIZE, IMAGE_SIZE, device=model_engine.device ) labels = torch.randint( - 0, NUM_CLASSES, (micro_batch_size,), - device=model_engine.device + 0, NUM_CLASSES, (micro_batch_size,), device=model_engine.device ) - + # Forward pass outputs = model_engine(inputs) loss = criterion(outputs, labels) - + # Backward pass (DeepSpeed handles gradients, optimization) model_engine.backward(loss) model_engine.step() - + total_loss += loss.item() - + if local_rank == 0 and (batch_idx + 1) % 10 == 0: - print(f"Epoch [{epoch+1}] Batch [{batch_idx+1}/{NUM_BATCHES}] Loss: {loss.item():.4f}") - + print( + f"Epoch [{epoch+1}] Batch [{batch_idx+1}/{NUM_BATCHES}] Loss: {loss.item():.4f}" + ) + epoch_time = time.time() - start_time avg_loss = total_loss / NUM_BATCHES - + # Calculate node-local throughput # Get local world size (GPUs per node) local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) - + # Node throughput = samples processed by all GPUs on this node node_throughput = (NUM_BATCHES * micro_batch_size * local_world_size) / epoch_time - + return avg_loss, node_throughput + def main(): # Start timer for total test duration test_start_time = time.time() - + # Parse DeepSpeed args parser = argparse.ArgumentParser() # local_rank default should come from environment (set by torchrun) - parser.add_argument('--local_rank', type=int, 
default=int(os.environ.get('LOCAL_RANK', 0))) - parser.add_argument('--deepspeed_config', type=str, default='ds_config.json') + parser.add_argument( + "--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)) + ) + parser.add_argument("--deepspeed_config", type=str, default="ds_config.json") args = parser.parse_args() - + # Handle config file path - supports multiple locations for K8s/local execution config_found = False original_config_path = args.deepspeed_config script_dir = os.path.dirname(os.path.abspath(__file__)) - + # Try 1: Check as-is (current directory or absolute path) if os.path.exists(args.deepspeed_config): config_found = True print(f"[Config] Found DeepSpeed config: {args.deepspeed_config}") - + # Try 2: Check relative to script directory (for K8s execution) if not config_found: config_path = os.path.join(script_dir, args.deepspeed_config) @@ -137,80 +145,86 @@ def main(): args.deepspeed_config = config_path config_found = True print(f"[Config] Found DeepSpeed config in script directory: {config_path}") - + # Try 3: Check in scripts/dummy_deepspeed/ directory (for local execution) if not config_found: - local_config_path = os.path.join('scripts/dummy_deepspeed', args.deepspeed_config) + local_config_path = os.path.join( + "scripts/dummy_deepspeed", args.deepspeed_config + ) if os.path.exists(local_config_path): args.deepspeed_config = local_config_path config_found = True - print(f"[Config] Found DeepSpeed config in scripts directory: {local_config_path}") - + print( + f"[Config] Found DeepSpeed config in scripts directory: {local_config_path}" + ) + # Error if not found if not config_found: print(f"\n❌ Error: DeepSpeed config not found!") print(f"Searched for: {original_config_path}") print(f"Locations tried:") print(f" 1. Current directory: {os.getcwd()}/{original_config_path}") - print(f" 2. Script directory: {os.path.join(script_dir, original_config_path)}") + print( + f" 2. 
Script directory: {os.path.join(script_dir, original_config_path)}" + ) print(f" 3. Scripts directory: scripts/dummy_deepspeed/{original_config_path}") print(f"\nCurrent directory: {os.getcwd()}") print(f"Files in current directory:") try: - for f in os.listdir('.'): + for f in os.listdir("."): print(f" - {f}") except Exception as e: print(f" (Cannot list: {e})") print(f"\nScript location: {os.path.abspath(__file__)}") sys.exit(1) - + print_header(args) - + # Initialize PyTorch distributed backend BEFORE DeepSpeed # This prevents DeepSpeed from trying to use MPI if not dist.is_initialized(): dist.init_process_group(backend="nccl") print(f"✓ PyTorch distributed initialized (backend: nccl)") - + # Create model model = SimpleModel(NUM_CLASSES) - + # Initialize DeepSpeed # Note: When using deepspeed launcher with --deepspeed_config arg, # do NOT pass config parameter to initialize() - it causes a conflict model_engine, optimizer, _, _ = deepspeed.initialize( - args=args, - model=model, - model_parameters=model.parameters() + args=args, model=model, model_parameters=model.parameters() ) - + criterion = nn.CrossEntropyLoss() - + rank = model_engine.local_rank - + if rank == 0: print(f"\n✓ DeepSpeed initialized") print(f" ZeRO Stage: {model_engine.zero_optimization_stage()}") print(f" Micro Batch Size: {model_engine.train_micro_batch_size_per_gpu()}") print(f" Gradient Accumulation: {model_engine.gradient_accumulation_steps()}") print(f"\nStarting training...\n") - + # Get topology information rank = int(os.environ.get("RANK", 0)) local_rank = model_engine.local_rank local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) world_size = model_engine.world_size node_rank = rank // local_world_size if local_world_size > 0 else 0 - + # Training loop all_throughputs = [] for epoch in range(NUM_EPOCHS): avg_loss, node_throughput = train_epoch(model_engine, criterion, epoch) all_throughputs.append(node_throughput) - + if local_rank == 0: - print(f"\n[Node {node_rank}] Epoch 
{epoch+1} Complete: Loss={avg_loss:.4f}, Node Throughput={node_throughput:.2f} samples/sec\n") - + print( + f"\n[Node {node_rank}] Epoch {epoch+1} Complete: Loss={avg_loss:.4f}, Node Throughput={node_throughput:.2f} samples/sec\n" + ) + # ======================================================================== # Node-Local Performance Reporting (NEW - Best Practice) # Each node reports its OWN performance @@ -226,24 +240,26 @@ def main(): print(f"Node Throughput: {avg_node_throughput:.2f} samples_per_second") print(f"ZeRO Stage: {model_engine.zero_optimization_stage()}") print(f"{'='*70}") - + # CRITICAL: Standard output format for madengine parsing print(f"\nperformance: {avg_node_throughput:.2f} samples_per_second") print(f"node_id: {node_rank}") print(f"local_gpus: {local_world_size}") print(f"deepspeed_config: ZeRO_stage={model_engine.zero_optimization_stage()}") - + # Calculate and print test duration test_duration = time.time() - test_start_time print(f"test_duration: {test_duration:.2f}s") - + return 0 + if __name__ == "__main__": try: sys.exit(main()) except Exception as e: print(f"Error: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_megatron_lm/run_megatron.py b/tests/fixtures/dummy/scripts/dummy_megatron_lm/run_megatron.py index 70265702..f293c386 100755 --- a/tests/fixtures/dummy/scripts/dummy_megatron_lm/run_megatron.py +++ b/tests/fixtures/dummy/scripts/dummy_megatron_lm/run_megatron.py @@ -16,9 +16,10 @@ """ import os +import socket import sys import time -import socket + import torch import torch.nn as nn @@ -26,12 +27,13 @@ try: from megatron.core import mpu, tensor_parallel from megatron.core.parallel_state import ( - initialize_model_parallel, destroy_model_parallel, - get_tensor_model_parallel_world_size, - get_pipeline_model_parallel_world_size, get_data_parallel_world_size, + get_pipeline_model_parallel_world_size, + get_tensor_model_parallel_world_size, + 
initialize_model_parallel, ) + MEGATRON_AVAILABLE = True except ImportError: MEGATRON_AVAILABLE = False @@ -57,6 +59,7 @@ pipeline_model_parallel_size = int(os.environ.get("PIPELINE_MODEL_PARALLEL_SIZE", 1)) context_parallel_size = int(os.environ.get("CONTEXT_PARALLEL_SIZE", 1)) + def print_header(tp_size, pp_size, dp_size): """Print training configuration header""" print("=" * 70) @@ -77,110 +80,118 @@ def print_header(tp_size, pp_size, dp_size): print(f" Hidden Size: {HIDDEN_SIZE}") print("=" * 70) + class SimpleMegatronModel(nn.Module): """ Simplified model using Megatron-style patterns. In production, use megatron.core.models for actual transformer implementations. """ + def __init__(self, hidden_size, num_classes): super().__init__() self.embedding = nn.Linear(SEQ_LENGTH, hidden_size) - + # Simple transformer layers self.transformer = nn.TransformerEncoder( nn.TransformerEncoderLayer( d_model=hidden_size, nhead=8, dim_feedforward=hidden_size * 4, - batch_first=True + batch_first=True, ), - num_layers=6 + num_layers=6, ) self.classifier = nn.Linear(hidden_size, num_classes) - + def forward(self, x): x = self.embedding(x) x = self.transformer(x) x = x.mean(dim=1) # Global pooling return self.classifier(x) + def train_epoch(model, optimizer, criterion, epoch, device, local_dp_size): """Training loop for one epoch with node-local throughput""" model.train() start_time = time.time() total_loss = 0 - + for batch_idx in range(NUM_BATCHES): # Generate synthetic data inputs = torch.randn(BATCH_SIZE, 1, SEQ_LENGTH, device=device) labels = torch.randint(0, NUM_CLASSES, (BATCH_SIZE,), device=device) - + # Forward pass optimizer.zero_grad() outputs = model(inputs) loss = criterion(outputs, labels) - + # Backward pass loss.backward() - + # Optimizer step optimizer.step() - + total_loss += loss.item() - + # Log progress from local_rank 0 if local_rank == 0 and (batch_idx + 1) % 10 == 0: - print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " - f"Batch [{batch_idx+1}/{NUM_BATCHES}] " - 
f"Loss: {loss.item():.4f}") - + print( + f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f}" + ) + epoch_time = time.time() - start_time avg_loss = total_loss / NUM_BATCHES - + # Calculate node-local throughput # local_dp_size = data parallel size on this node node_throughput = (NUM_BATCHES * BATCH_SIZE * local_dp_size) / epoch_time - + return avg_loss, node_throughput + def main(): """Main training function using Megatron-Core""" # Start timer for total test duration test_start_time = time.time() - + # Set device device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") if torch.cuda.is_available(): torch.cuda.set_device(device) - + # Initialize distributed and model parallelism if MEGATRON_AVAILABLE and world_size > 1: # Initialize with Megatron-Core if rank == 0: print(f"[Rank {rank}] Initializing Megatron-Core model parallelism...") - + torch.distributed.init_process_group(backend="nccl", init_method="env://") - + # Initialize Megatron model parallel groups initialize_model_parallel( tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size, context_parallel_size=context_parallel_size, ) - + # Get actual parallel sizes from Megatron-Core tp_size = get_tensor_model_parallel_world_size() pp_size = get_pipeline_model_parallel_world_size() dp_size = get_data_parallel_world_size() - + if rank == 0: print(f"[Rank {rank}] ✓ Megatron-Core initialized") print(f"[Rank {rank}] TP={tp_size}, PP={pp_size}, DP={dp_size}") - + elif world_size > 1: # Fallback to basic DDP if rank == 0: - print(f"[Rank {rank}] Using basic PyTorch DDP (Megatron-Core not available)") + print( + f"[Rank {rank}] Using basic PyTorch DDP (Megatron-Core not available)" + ) torch.distributed.init_process_group(backend="nccl", init_method="env://") tp_size = 1 pp_size = 1 @@ -190,45 +201,50 @@ def main(): tp_size = 1 pp_size = 1 dp_size = 1 - + # Print configuration 
print_header(tp_size, pp_size, dp_size) - + if torch.cuda.is_available(): print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") - + # Create model model = SimpleMegatronModel(HIDDEN_SIZE, NUM_CLASSES).to(device) - + # Wrap with DDP if needed (in production, use Megatron's model wrappers) if world_size > 1 and not MEGATRON_AVAILABLE: from torch.nn.parallel import DistributedDataParallel as DDP + model = DDP(model, device_ids=[local_rank], output_device=local_rank) - + # Optimizer and loss optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01) criterion = nn.CrossEntropyLoss() - + # Get local world size and node rank local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) node_rank = rank // local_world_size if local_world_size > 0 else 0 - + # Calculate local data parallel size (DP ranks on this node) # In Megatron: DP = world_size / (TP * PP * CP) # For simplicity, assume local_dp_size proportional to local_world_size - local_dp_size = dp_size // (world_size // local_world_size) if (world_size // local_world_size) > 0 else dp_size + local_dp_size = ( + dp_size // (world_size // local_world_size) + if (world_size // local_world_size) > 0 + else dp_size + ) if local_dp_size < 1: local_dp_size = 1 - + # Synchronize before training if world_size > 1: torch.distributed.barrier() - + if local_rank == 0: print(f"\n{'='*70}") print(f"[Node {node_rank}] Starting Training") print(f"{'='*70}\n") - + # Training loop all_throughputs = [] for epoch in range(NUM_EPOCHS): @@ -236,12 +252,12 @@ def main(): model, optimizer, criterion, epoch, device, local_dp_size ) all_throughputs.append(node_throughput) - + if local_rank == 0: print(f"\n[Node {node_rank}] Epoch {epoch+1}/{NUM_EPOCHS} Complete:") print(f" Loss: {avg_loss:.4f}") print(f" Node Throughput: {node_throughput:.2f} samples/sec\n") - + # ======================================================================== # Node-Local Performance Reporting (NEW - Best Practice) # 
======================================================================== @@ -260,33 +276,37 @@ def main(): print(f" Context Parallel (CP): {context_parallel_size}") print(f" Data Parallel (DP): {dp_size}") print(f"{'='*70}") - + # CRITICAL: Standard output format for madengine parsing print(f"\nperformance: {avg_node_throughput:.2f} samples_per_second") print(f"node_id: {node_rank}") print(f"local_gpus: {local_world_size}") - print(f"megatron_config: TP={tp_size} PP={pp_size} CP={context_parallel_size} DP={dp_size}") - + print( + f"megatron_config: TP={tp_size} PP={pp_size} CP={context_parallel_size} DP={dp_size}" + ) + # Calculate and print test duration test_duration = time.time() - test_start_time print(f"test_duration: {test_duration:.2f}s") - + # Cleanup if MEGATRON_AVAILABLE and world_size > 1: destroy_model_parallel() - + if world_size > 1: torch.distributed.destroy_process_group() if rank == 0: print(f"\n✓ Distributed cleanup complete") - + return 0 + if __name__ == "__main__": try: sys.exit(main()) except Exception as e: print(f"[Rank {rank}] Error: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_sglang/run_sglang_inference.py b/tests/fixtures/dummy/scripts/dummy_sglang/run_sglang_inference.py index be68a74d..b5c6fa74 100644 --- a/tests/fixtures/dummy/scripts/dummy_sglang/run_sglang_inference.py +++ b/tests/fixtures/dummy/scripts/dummy_sglang/run_sglang_inference.py @@ -12,11 +12,11 @@ Multi-node: One serve per node (TP only on that node), nnodes=1 per process """ +import argparse import os +import socket import sys import time -import argparse -import socket from typing import List, Optional # Configure environment before importing SGLang @@ -78,26 +78,26 @@ def generate_prompts(num_prompts: int) -> List[str]: def run_inference_sglang(args): """ Run SGLang inference using native Runtime API. - + SGLang handles distributed setup automatically via Ray. No torchrun needed! 
""" print("\n" + "=" * 70) print("Initializing SGLang Runtime") print("=" * 70) - + try: # Initialize SGLang runtime # SGLang automatically handles multi-node setup via Ray # when appropriate environment variables are set - + runtime_config = { "model_path": args.model, "tp_size": args.tp_size, "trust_remote_code": True, "mem_fraction_static": 0.90, } - + # For multi-node, set Ray init address if args.nnodes > 1: runtime_config["nccl_init_addr"] = f"{args.master_addr}:{args.master_port}" @@ -106,19 +106,19 @@ def run_inference_sglang(args): print(f"Multi-node setup: {args.nnodes} nodes, rank {args.node_rank}") else: print(f"Single-node setup: {args.tp_size} GPUs") - + # Initialize runtime runtime = sgl.Runtime(**runtime_config) print("✓ SGLang runtime initialized successfully") - + except Exception as e: print(f"✗ Failed to initialize SGLang runtime: {e}") print("\n⚠️ Falling back to mock inference for testing...") return run_inference_mock(args) - + # Generate prompts prompts = generate_prompts(NUM_PROMPTS) - + # Warmup print("\nWarmup: Running 10 prompts...") warmup_prompts = prompts[:10] @@ -129,16 +129,16 @@ def run_inference_sglang(args): "max_new_tokens": MAX_TOKENS, "temperature": TEMPERATURE, "top_p": TOP_P, - } + }, ) print("✓ Warmup complete") except Exception as e: print(f"⚠️ Warmup failed: {e}") - + # Benchmark print(f"\nBenchmark: Running {NUM_PROMPTS} prompts...") start_time = time.time() - + try: outputs = runtime.generate( prompts, @@ -146,17 +146,19 @@ def run_inference_sglang(args): "max_new_tokens": MAX_TOKENS, "temperature": TEMPERATURE, "top_p": TOP_P, - } + }, ) - + end_time = time.time() elapsed_time = end_time - start_time - + # Calculate metrics - total_tokens = sum(len(output["meta_info"]["completion_tokens"]) for output in outputs) + total_tokens = sum( + len(output["meta_info"]["completion_tokens"]) for output in outputs + ) throughput = NUM_PROMPTS / elapsed_time tokens_per_second = total_tokens / elapsed_time - + # Print results 
print(f"\n{'=' * 70}") print("Benchmark Results") @@ -167,7 +169,7 @@ def run_inference_sglang(args): print(f"Token generation: {tokens_per_second:.2f} tokens/second") print(f"Average latency: {(elapsed_time / NUM_PROMPTS) * 1000:.2f} ms/request") print("=" * 70) - + # Print sample outputs print("\n" + "=" * 70) print("Sample Outputs (first 3)") @@ -177,22 +179,23 @@ def run_inference_sglang(args): generated_text = output["text"] print(f"\n[Prompt {i+1}]: {prompt}") print(f"[Output {i+1}]: {generated_text[:200]}...") - + # madengine output format print(f"\nperformance: {throughput:.2f} requests_per_second") print(f"tokens_per_second: {tokens_per_second:.2f}") print(f"model: {args.model}") print(f"tp_size: {args.tp_size}") print(f"nnodes: {args.nnodes}") - + # Cleanup runtime.shutdown() - + return 0 - + except Exception as e: print(f"✗ Inference failed: {e}") import traceback + traceback.print_exc() print("\n⚠️ Falling back to mock inference...") return run_inference_mock(args) @@ -207,35 +210,35 @@ def run_inference_mock(args): print("=" * 70) print("This simulates SGLang inference for testing madengine infrastructure.") print("=" * 70) - + # Simulate initialization print("\nInitializing mock SGLang runtime...") time.sleep(1) print("✓ Mock runtime initialized") - + # Generate prompts prompts = generate_prompts(NUM_PROMPTS) - + # Warmup print("\nWarmup: Running 10 prompts...") time.sleep(0.5) print("✓ Warmup complete") - + # Benchmark print(f"\nBenchmark: Running {NUM_PROMPTS} prompts...") start_time = time.time() - + # Simulate inference time.sleep(2.0) - + end_time = time.time() elapsed_time = end_time - start_time - + # Mock metrics total_tokens = NUM_PROMPTS * MAX_TOKENS throughput = NUM_PROMPTS / elapsed_time tokens_per_second = total_tokens / elapsed_time - + # Print results print(f"\n{'=' * 70}") print("Benchmark Results (Mock)") @@ -246,7 +249,7 @@ def run_inference_mock(args): print(f"Token generation: {tokens_per_second:.2f} tokens/second") print(f"Average 
latency: {(elapsed_time / NUM_PROMPTS) * 1000:.2f} ms/request") print("=" * 70) - + # Print sample outputs print("\n" + "=" * 70) print("Sample Outputs (Mock - first 3)") @@ -254,14 +257,14 @@ def run_inference_mock(args): for i in range(3): print(f"\n[Prompt {i+1}]: {prompts[i]}") print(f"[Output {i+1}]: [Mock generated text for infrastructure testing...]") - + # madengine output format print(f"\nperformance: {throughput:.2f} requests_per_second") print(f"tokens_per_second: {tokens_per_second:.2f}") print(f"model: {args.model}") print(f"tp_size: {args.tp_size}") print(f"nnodes: {args.nnodes}") - + return 0 @@ -274,62 +277,56 @@ def main(): "--model", type=str, default=DEFAULT_MODEL, - help=f"Model name or path (default: {DEFAULT_MODEL})" + help=f"Model name or path (default: {DEFAULT_MODEL})", ) parser.add_argument( "--tp-size", type=int, default=1, - help="Tensor parallel size (GPUs per node, default: 1)" + help="Tensor parallel size (GPUs per node, default: 1)", ) parser.add_argument( - "--nnodes", - type=int, - default=1, - help="Number of nodes (default: 1)" + "--nnodes", type=int, default=1, help="Number of nodes (default: 1)" ) parser.add_argument( - "--node-rank", - type=int, - default=0, - help="Node rank (0-indexed, default: 0)" + "--node-rank", type=int, default=0, help="Node rank (0-indexed, default: 0)" ) parser.add_argument( "--master-addr", type=str, default="localhost", - help="Master node address (default: localhost)" + help="Master node address (default: localhost)", ) parser.add_argument( "--master-port", type=int, default=29500, - help="Master communication port (default: 29500)" + help="Master communication port (default: 29500)", ) parser.add_argument( "--mock-only", action="store_true", - help="Force mock inference (skip real SGLang)" + help="Force mock inference (skip real SGLang)", ) - + args = parser.parse_args() - + # Validate arguments if args.tp_size < 1: print("Error: tp-size must be >= 1") return 1 - + if args.nnodes < 1: 
print("Error: nnodes must be >= 1") return 1 - + if args.node_rank < 0 or args.node_rank >= args.nnodes: print(f"Error: node-rank must be in range [0, {args.nnodes-1}]") return 1 - + # Print configuration print_header(args) - + # Run inference if args.mock_only: return run_inference_mock(args) @@ -346,5 +343,6 @@ def main(): except Exception as e: print(f"\nError: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/requirements.txt b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/requirements.txt index 25f8ad69..95ffe1d7 100644 --- a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/requirements.txt +++ b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/requirements.txt @@ -1,3 +1,2 @@ # Minimal requirements for dummy test # No actual SGLang needed - this is a simulation - diff --git a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run.sh b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run.sh index 9661fc17..5d24f16e 100755 --- a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run.sh +++ b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run.sh @@ -14,7 +14,7 @@ if [ "${SGLANG_DISAGG_MODE:-}" = "enabled" ]; then echo " Node Rank: ${SGLANG_NODE_RANK:-unknown}" echo " Prefill Nodes: ${SGLANG_DISAGG_PREFILL_NODES:-unknown}" echo " Decode Nodes: ${SGLANG_DISAGG_DECODE_NODES:-unknown}" - + # Run Python script that handles node roles python3 run_sglang_disagg_inference.py else @@ -26,4 +26,3 @@ fi echo "============================================" echo "✓ SGLang Disagg Test Complete" echo "============================================" - diff --git a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run_sglang_disagg_inference.py b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run_sglang_disagg_inference.py index 94b476b6..7e65d235 100755 --- a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run_sglang_disagg_inference.py +++ 
b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run_sglang_disagg_inference.py @@ -8,9 +8,9 @@ """ import os +import socket import sys import time -import socket from typing import Optional @@ -48,23 +48,23 @@ def simulate_proxy_node(info: dict): print(f"Prefill Nodes: {info['prefill_nodes']}") print(f"Decode Nodes: {info['decode_nodes']}") print("-" * 60) - + print("\n[Proxy] Initializing load balancer...") time.sleep(1) - + print("[Proxy] Waiting for prefill nodes to be ready...") - for i in range(1, info['prefill_nodes'] + 1): + for i in range(1, info["prefill_nodes"] + 1): print(f" ✓ Prefill node {i} connected") time.sleep(0.5) - + print("[Proxy] Waiting for decode nodes to be ready...") - for i in range(info['prefill_nodes'] + 1, info['total_nodes']): + for i in range(info["prefill_nodes"] + 1, info["total_nodes"]): print(f" ✓ Decode node {i} connected") time.sleep(0.5) - + print("\n[Proxy] All nodes connected. Load balancer ready!") print("[Proxy] Simulating request routing...") - + # Simulate some requests for req_id in range(1, 4): print(f"\n[Proxy] Request {req_id}:") @@ -72,10 +72,12 @@ def simulate_proxy_node(info: dict): time.sleep(0.3) print(f" → KV cache transferred via Mooncake") time.sleep(0.3) - print(f" → Routing to decode node {info['prefill_nodes'] + ((req_id % info['decode_nodes']) + 1)}") + print( + f" → Routing to decode node {info['prefill_nodes'] + ((req_id % info['decode_nodes']) + 1)}" + ) time.sleep(0.3) print(f" ✓ Request {req_id} completed") - + print("\n[Proxy] Test complete. 
Shutting down...") @@ -89,18 +91,18 @@ def simulate_prefill_node(info: dict): print(f"Tensor Parallel Size: {info['tp_size']}") print(f"Role: Prompt Processing") print("-" * 60) - + print("\n[Prefill] Initializing prefill server...") time.sleep(1) - + print("[Prefill] Loading model shards...") - for shard in range(info['tp_size']): + for shard in range(info["tp_size"]): print(f" ✓ Shard {shard + 1}/{info['tp_size']} loaded") time.sleep(0.3) - + print("\n[Prefill] Server ready. Listening for requests...") time.sleep(1) - + print("[Prefill] Processing prompts...") for batch in range(1, 4): print(f"\n[Prefill] Batch {batch}:") @@ -111,7 +113,7 @@ def simulate_prefill_node(info: dict): print(f" → Transferring KV cache via Mooncake...") time.sleep(0.3) print(f" ✓ Batch {batch} complete") - + print("\n[Prefill] Test complete. Shutting down...") @@ -125,18 +127,18 @@ def simulate_decode_node(info: dict): print(f"Tensor Parallel Size: {info['tp_size']}") print(f"Role: Token Generation") print("-" * 60) - + print("\n[Decode] Initializing decode server...") time.sleep(1) - + print("[Decode] Loading model shards...") - for shard in range(info['tp_size']): + for shard in range(info["tp_size"]): print(f" ✓ Shard {shard + 1}/{info['tp_size']} loaded") time.sleep(0.3) - + print("\n[Decode] Server ready. Listening for KV caches...") time.sleep(1) - + print("[Decode] Generating tokens...") for batch in range(1, 4): print(f"\n[Decode] Batch {batch}:") @@ -148,7 +150,7 @@ def simulate_decode_node(info: dict): time.sleep(0.2) print(f" ✓ Generated 5 tokens") print(f" ✓ Batch {batch} complete") - + print("\n[Decode] Test complete. 
Shutting down...") @@ -157,22 +159,24 @@ def main(): print("\n" + "=" * 60) print("SGLang Disaggregated Inference Simulation") print("=" * 60 + "\n") - + # Get node information info = get_node_info() role = determine_node_role(info["node_rank"], info["prefill_nodes"]) - + print(f"Cluster Configuration:") print(f" Total Nodes: {info['total_nodes']}") print(f" Prefill Nodes: {info['prefill_nodes']} (ranks 1-{info['prefill_nodes']})") - print(f" Decode Nodes: {info['decode_nodes']} (ranks {info['prefill_nodes']+1}-{info['total_nodes']-1})") + print( + f" Decode Nodes: {info['decode_nodes']} (ranks {info['prefill_nodes']+1}-{info['total_nodes']-1})" + ) print(f" Proxy Node: 1 (rank 0)") print(f"\nThis Node:") print(f" Rank: {info['node_rank']}") print(f" Role: {role.upper()}") print(f" Hostname: {info['hostname']}") print() - + # Simulate based on role try: if role == "proxy": @@ -184,22 +188,22 @@ def main(): else: print(f"❌ ERROR: Unknown role '{role}'") sys.exit(1) - + print("\n" + "=" * 60) print("✅ Simulation Complete") print("=" * 60) return 0 - + except KeyboardInterrupt: print("\n\n⚠️ Interrupted by user") return 130 except Exception as e: print(f"\n❌ ERROR: {e}") import traceback + traceback.print_exc() return 1 if __name__ == "__main__": sys.exit(main()) - diff --git a/tests/fixtures/dummy/scripts/dummy_therock/run.sh b/tests/fixtures/dummy/scripts/dummy_therock/run.sh index 12cafac4..8d52714f 100755 --- a/tests/fixtures/dummy/scripts/dummy_therock/run.sh +++ b/tests/fixtures/dummy/scripts/dummy_therock/run.sh @@ -45,4 +45,3 @@ echo "" echo "========================================================================" echo "Benchmark completed!" 
echo "========================================================================" - diff --git a/tests/fixtures/dummy/scripts/dummy_therock/train_resnet.py b/tests/fixtures/dummy/scripts/dummy_therock/train_resnet.py index c90fe482..25c8ad27 100755 --- a/tests/fixtures/dummy/scripts/dummy_therock/train_resnet.py +++ b/tests/fixtures/dummy/scripts/dummy_therock/train_resnet.py @@ -5,11 +5,12 @@ This script benchmarks ResNet50 training performance using PyTorch on TheRock's ROCm distribution. """ +import sys +import time + import torch import torch.nn as nn import torchvision.models as models -import time -import sys # Configuration BATCH_SIZE = 64 @@ -21,80 +22,80 @@ def main(): print("=" * 70) print("ResNet50 Training Benchmark (TheRock)") print("=" * 70) - + # Setup device device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print(f"Device: {device}") - + if torch.cuda.is_available(): print(f"GPU: {torch.cuda.get_device_name(0)}") print(f"GPU Count: {torch.cuda.device_count()}") - + # Create model print("\nCreating ResNet50 model...") model = models.resnet50(pretrained=False, num_classes=1000).to(device) model.train() - + # Setup optimizer and loss optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) criterion = nn.CrossEntropyLoss() - + print(f"Batch Size: {BATCH_SIZE}") print(f"Iterations: {NUM_ITERATIONS}") print(f"Image Size: {IMAGE_SIZE}x{IMAGE_SIZE}") - + # Warmup print("\nWarming up (10 iterations)...") for _ in range(10): images = torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE, device=device) labels = torch.randint(0, 1000, (BATCH_SIZE,), device=device) - + optimizer.zero_grad() outputs = model(images) loss = criterion(outputs, labels) loss.backward() optimizer.step() - + if torch.cuda.is_available(): torch.cuda.synchronize() - + # Benchmark print(f"Running benchmark ({NUM_ITERATIONS} iterations)...") start_time = time.time() - + for i in range(NUM_ITERATIONS): images = torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, 
IMAGE_SIZE, device=device) labels = torch.randint(0, 1000, (BATCH_SIZE,), device=device) - + optimizer.zero_grad() outputs = model(images) loss = criterion(outputs, labels) loss.backward() optimizer.step() - + if (i + 1) % 20 == 0: print(f" Progress: {i + 1}/{NUM_ITERATIONS}") - + if torch.cuda.is_available(): torch.cuda.synchronize() - + end_time = time.time() - + # Calculate metrics duration = end_time - start_time total_images = BATCH_SIZE * NUM_ITERATIONS images_per_sec = total_images / duration - + print("\n" + "=" * 70) print("Benchmark Results:") print(f" Total Images Processed: {total_images}") print(f" Duration: {duration:.2f} seconds") print(f" Throughput: {images_per_sec:.2f} images/sec") print("=" * 70) - + # madengine performance output (required format) print(f"\nperformance: {images_per_sec:.2f} images_per_second") - + return 0 @@ -104,6 +105,6 @@ def main(): except Exception as e: print(f"Error: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) - diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/helper.py b/tests/fixtures/dummy/scripts/dummy_torchrun/helper.py index e705ce30..287d4d3f 100644 --- a/tests/fixtures/dummy/scripts/dummy_torchrun/helper.py +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/helper.py @@ -15,24 +15,33 @@ class ResidualBlock(nn.Module): """Residual block with skip connection""" + def __init__(self, in_channels, out_channels, stride=1): super(ResidualBlock, self).__init__() - self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, - stride=stride, padding=1, bias=False) + self.conv1 = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) self.bn1 = nn.BatchNorm2d(out_channels) - self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, - stride=1, padding=1, bias=False) + self.conv2 = nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False + ) self.bn2 = nn.BatchNorm2d(out_channels) - + # Skip 
connection self.skip = nn.Sequential() if stride != 1 or in_channels != out_channels: self.skip = nn.Sequential( - nn.Conv2d(in_channels, out_channels, kernel_size=1, - stride=stride, bias=False), - nn.BatchNorm2d(out_channels) + nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(out_channels), ) - + def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) @@ -44,29 +53,30 @@ def forward(self, x): class ResNetModel(nn.Module): """ ResNet-style model for distributed training benchmark. - + This is a more realistic model architecture compared to SimpleCNN, demonstrating residual connections and deeper networks. """ + def __init__(self, num_classes=1000, num_blocks=[2, 2, 2, 2]): super(ResNetModel, self).__init__() self.in_channels = 64 - + # Initial convolution self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - + # Residual layers self.layer1 = self._make_layer(64, num_blocks[0], stride=1) self.layer2 = self._make_layer(128, num_blocks[1], stride=2) self.layer3 = self._make_layer(256, num_blocks[2], stride=2) self.layer4 = self._make_layer(512, num_blocks[3], stride=2) - + # Classification head self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(512, num_classes) - + def _make_layer(self, out_channels, num_blocks, stride): """Create a layer with multiple residual blocks""" strides = [stride] + [1] * (num_blocks - 1) @@ -75,7 +85,7 @@ def _make_layer(self, out_channels, num_blocks, stride): layers.append(ResidualBlock(self.in_channels, out_channels, stride)) self.in_channels = out_channels return nn.Sequential(*layers) - + def forward(self, x): out = self.pool(F.relu(self.bn1(self.conv1(x)))) out = self.layer1(out) @@ -91,31 +101,33 @@ def forward(self, x): class SyntheticDataset: """ Synthetic dataset generator for benchmarking. 
- + Generates random data on-the-fly to avoid I/O bottlenecks and provide consistent benchmarking results. """ + def __init__(self, num_samples, batch_size, image_size=224, num_classes=1000): self.num_samples = num_samples self.batch_size = batch_size self.image_size = image_size self.num_classes = num_classes self.num_batches = num_samples // batch_size - + def generate_batch(self, device): """Generate a synthetic batch of images and labels""" - images = torch.randn(self.batch_size, 3, self.image_size, - self.image_size, device=device) - labels = torch.randint(0, self.num_classes, (self.batch_size,), - device=device) + images = torch.randn( + self.batch_size, 3, self.image_size, self.image_size, device=device + ) + labels = torch.randint(0, self.num_classes, (self.batch_size,), device=device) return images, labels - + def __len__(self): return self.num_batches class BenchmarkConfig: """Configuration for distributed training benchmark""" + def __init__(self): # Training hyperparameters self.batch_size = 128 @@ -123,16 +135,16 @@ def __init__(self): self.learning_rate = 0.01 self.momentum = 0.9 self.weight_decay = 1e-4 - + # Data configuration self.image_size = 224 self.num_classes = 1000 self.num_batches = 100 - + # Model configuration self.model_type = "resnet" # or "simple_cnn" self.resnet_blocks = [2, 2, 2, 2] # ResNet-18 style - + def __str__(self): return ( f"BenchmarkConfig(\n" @@ -150,7 +162,7 @@ def print_distributed_info(rank, local_rank, world_size): """Print distributed training information""" import socket import os - + print(f"\n[Rank {rank}] Distributed Training Info:") print(f" Hostname: {socket.gethostname()}") print(f" Global Rank: {rank}") @@ -166,7 +178,9 @@ def print_gpu_info(rank, device): print(f"\n[Rank {rank}] GPU Info:") print(f" Device: {device}") print(f" GPU Name: {torch.cuda.get_device_name(device)}") - print(f" GPU Memory: {torch.cuda.get_device_properties(device).total_memory / 1e9:.2f} GB") + print( + f" GPU Memory: 
{torch.cuda.get_device_properties(device).total_memory / 1e9:.2f} GB" + ) else: print(f"\n[Rank {rank}] Warning: CUDA not available, using CPU") diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run.sh b/tests/fixtures/dummy/scripts/dummy_torchrun/run.sh index bc0f2318..fdbffe84 100755 --- a/tests/fixtures/dummy/scripts/dummy_torchrun/run.sh +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run.sh @@ -19,10 +19,10 @@ cd "$SCRIPT_DIR" if [ -z "$MAD_MULTI_NODE_RUNNER" ]; then # Get number of GPUs from environment N_GPUS="${MAD_RUNTIME_NGPUS:-1}" - + echo "ℹ️ MAD_MULTI_NODE_RUNNER not set, using standalone torchrun" echo "ℹ️ Using $N_GPUS GPUs" - + MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node=$N_GPUS" fi diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun.py index 204ae985..69abec8b 100644 --- a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun.py +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun.py @@ -13,22 +13,23 @@ Usage: # Single GPU torchrun --standalone --nproc_per_node=1 run_torchrun.py - + # Multi-GPU (single node) torchrun --standalone --nproc_per_node=8 run_torchrun.py - + # Multi-node (via K8s with torchrun) torchrun --nnodes=4 --nproc_per_node=8 --master_addr=... 
run_torchrun.py """ import os +import socket import sys import time -import socket + import torch +import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP # Configuration @@ -68,21 +69,22 @@ def print_header(): class SimpleCNN(nn.Module): """Simple CNN model for benchmarking""" + def __init__(self, num_classes=1000): super(SimpleCNN, self).__init__() self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) self.bn1 = nn.BatchNorm2d(64) self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1) self.bn2 = nn.BatchNorm2d(128) - + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) self.bn3 = nn.BatchNorm2d(256) - + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(256, num_classes) - + def forward(self, x): x = self.pool(F.relu(self.bn1(self.conv1(x)))) x = self.pool(F.relu(self.bn2(self.conv2(x)))) @@ -106,62 +108,64 @@ def train_epoch(model, optimizer, criterion, epoch, device): epoch_start = time.time() total_samples = 0 total_loss = 0.0 - + for batch_idx in range(NUM_BATCHES): batch_start = time.time() - + # Generate synthetic data images, labels = generate_synthetic_batch(BATCH_SIZE, device) - + # Forward pass optimizer.zero_grad() outputs = model(images) loss = criterion(outputs, labels) - + # Backward pass (gradients are automatically synchronized across GPUs) loss.backward() - + # Update weights optimizer.step() - + batch_time = time.time() - batch_start total_samples += BATCH_SIZE total_loss += loss.item() - + # Print progress from local rank 0 on each node if local_rank == 0 and (batch_idx + 1) % 20 == 0: avg_loss = total_loss / (batch_idx + 1) throughput = BATCH_SIZE / batch_time # Local throughput - print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " - f"Batch [{batch_idx+1}/{NUM_BATCHES}] " - f"Loss: {loss.item():.4f} " - f"Throughput: 
{throughput:.2f} samples/sec (local)") - + print( + f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec (local)" + ) + epoch_time = time.time() - epoch_start avg_loss = total_loss / NUM_BATCHES - + # ======================================================================== # Node-Local Throughput Measurement # ======================================================================== # Calculate throughput for ALL GPUs on THIS NODE local_samples = NUM_BATCHES * BATCH_SIZE local_gpu_throughput = local_samples / epoch_time - + # Get local world size (GPUs per node) local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) - + # Node throughput = sum of all local GPUs on this node # In data parallel, each GPU processes the same throughput node_throughput = local_gpu_throughput * local_world_size - + # Return metrics dictionary metrics = { - 'avg_loss': avg_loss, - 'node_throughput': node_throughput, - 'epoch_time': epoch_time, - 'local_world_size': local_world_size + "avg_loss": avg_loss, + "node_throughput": node_throughput, + "epoch_time": epoch_time, + "local_world_size": local_world_size, } - + return metrics @@ -169,9 +173,9 @@ def main(): """Main training function""" # Start timer for total test duration test_start_time = time.time() - + print_header() - + # Create per-process MIOpen cache directory to avoid database conflicts # This must be done AFTER torchrun sets LOCAL_RANK environment variable # This prevents "Duplicate ID" errors and database corruption in multi-GPU training @@ -180,10 +184,12 @@ def main(): # Cannot use expandvars() because the template uses ${LOCAL_RANK} syntax miopen_template = os.environ["MIOPEN_USER_DB_PATH"] # Replace ${LOCAL_RANK} or $LOCAL_RANK with actual value - miopen_path = miopen_template.replace("${LOCAL_RANK:-0}", str(local_rank)).replace("$LOCAL_RANK", str(local_rank)) + miopen_path = miopen_template.replace( + 
"${LOCAL_RANK:-0}", str(local_rank) + ).replace("$LOCAL_RANK", str(local_rank)) os.makedirs(miopen_path, exist_ok=True) print(f"[Rank {rank}] ✓ Created MIOpen cache directory: {miopen_path}") - + # Initialize distributed training if world_size > 1: print(f"\n[Rank {rank}] Initializing distributed process group...") @@ -192,37 +198,41 @@ def main(): backend="nccl", init_method=f"env://", # Use environment variables (set by torchrun) world_size=world_size, - rank=rank + rank=rank, ) print(f"[Rank {rank}] ✓ Process group initialized") print(f"[Rank {rank}] Backend: {dist.get_backend()}") print(f"[Rank {rank}] World Size: {dist.get_world_size()}") else: print(f"\n=== Running in Standalone Mode (Single GPU) ===") - + # Set device if torch.cuda.is_available(): num_gpus = torch.cuda.device_count() print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") - print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") - + print( + f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}" + ) + if local_rank >= num_gpus: - print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print( + f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}" + ) print(f"[Rank {rank}] Using cuda:0 instead") device = torch.device("cuda:0") else: device = torch.device(f"cuda:{local_rank}") - + torch.cuda.set_device(device) print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") else: device = torch.device("cpu") print(f"[Rank {rank}] Warning: CUDA not available, using CPU") - + # Create model print(f"\n[Rank {rank}] Creating model...") model = SimpleCNN(num_classes=NUM_CLASSES).to(device) - + # Wrap model with DDP for distributed training if world_size > 1: # Best practice: Explicitly specify device_ids for DDP @@ -231,53 +241,57 @@ def main(): device_ids=[local_rank], output_device=local_rank, broadcast_buffers=True, # Ensure buffers (like BatchNorm stats) are synced - 
find_unused_parameters=False # Set True only if needed (performance impact) + find_unused_parameters=False, # Set True only if needed (performance impact) ) print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") - + # Create optimizer and loss function optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) criterion = nn.CrossEntropyLoss() - + # Synchronize before training if world_size > 1: # Best practice: Specify device to avoid warnings dist.barrier(device_ids=[local_rank]) - + # Get topology information early (needed for logging) local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) node_rank = rank // local_world_size if local_world_size > 0 else 0 - + if local_rank == 0: print(f"\n{'='*70}") print(f"[Node {node_rank}] Starting Training") print(f"{'='*70}") - + # Training loop all_metrics = [] for epoch in range(NUM_EPOCHS): - metrics = train_epoch( - model, optimizer, criterion, epoch, device - ) + metrics = train_epoch(model, optimizer, criterion, epoch, device) all_metrics.append(metrics) - + if local_rank == 0: print(f"\n[Node {node_rank}] Epoch [{epoch+1}/{NUM_EPOCHS}] Complete:") print(f" Average Loss: {metrics['avg_loss']:.4f}") print(f" Node Throughput: {metrics['node_throughput']:.2f} samples/sec") print(f" Local GPUs: {metrics['local_world_size']}") - + # Calculate average node throughput across all epochs - avg_node_throughput = sum(m['node_throughput'] for m in all_metrics) / len(all_metrics) - avg_epoch_time = sum(m['epoch_time'] for m in all_metrics) / len(all_metrics) - + avg_node_throughput = sum(m["node_throughput"] for m in all_metrics) / len( + all_metrics + ) + avg_epoch_time = sum(m["epoch_time"] for m in all_metrics) / len(all_metrics) + # Calculate num_nodes for reference - num_nodes = (world_size + local_world_size - 1) // local_world_size if local_world_size > 0 else 1 - + num_nodes = ( + (world_size + local_world_size - 1) // local_world_size + if local_world_size > 0 + else 1 + ) + # Synchronize 
before final output if world_size > 1: dist.barrier(device_ids=[local_rank]) - + # ======================================================================== # Node-Local Performance Reporting (NEW - Best Practice) # Each node reports its OWN performance @@ -293,24 +307,23 @@ def main(): print(f"Node Throughput: {avg_node_throughput:.2f} samples_per_second") print(f"Avg Time per Epoch: {avg_epoch_time:.2f}s") print(f"{'='*70}") - + # CRITICAL: Standard output format for madengine parsing print(f"performance: {avg_node_throughput:.2f} samples_per_second", flush=True) print(f"node_id: {node_rank}", flush=True) print(f"local_gpus: {local_world_size}", flush=True) - + # Calculate and print test duration test_duration = time.time() - test_start_time print(f"test_duration: {test_duration:.2f}s", flush=True) sys.stdout.flush() - # Cleanup if world_size > 1: dist.destroy_process_group() if rank == 0: print(f"✓ Process group destroyed") - + return 0 @@ -320,5 +333,6 @@ def main(): except Exception as e: print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_minio.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_minio.py index 26c9c236..8dbcd314 100755 --- a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_minio.py +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_minio.py @@ -22,14 +22,15 @@ """ import os +import pathlib +import socket import sys import time -import socket -import pathlib + import torch +import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP # Configuration @@ -75,23 +76,23 @@ def print_header(): def validate_data_availability(): """ Validate that required data is available (K8s best practice). - + Strategy: 1. Rank 0 checks data first and reports status 2. 
All ranks independently validate data (no barrier needed before init_process_group) 3. Exit gracefully if data missing - + Note: For K8s deployments, MAD_DATAHOME points to PVC mount point (/data). This ensures data is shared across all pods (single-node and multi-node). PVC must be configured with ReadWriteMany for multi-node deployments. - + Returns: bool: True if data is available, False otherwise """ # K8s best practice: Data stored in PVC at /data (separate from compute pods) data_home = os.environ.get("MAD_DATAHOME", "/data") data_path = pathlib.Path(data_home) / DATA_FILE - + if rank == 0: print(f"\n{'='*70}") print("Data Provider Validation") @@ -99,7 +100,7 @@ def validate_data_availability(): print(f"Data Home: {data_home}") print(f"Expected File: {DATA_FILE}") print(f"Full Path: {data_path}") - + if data_path.exists(): file_size = data_path.stat().st_size file_size_mb = file_size / (1024 * 1024) @@ -113,39 +114,40 @@ def validate_data_availability(): print(f"\n⚠️ Data provider should have downloaded this file.") print(f" Check data provider configuration and logs.") print(f"{'='*70}\n") - + # Note: Cannot use dist.barrier() here - process group not initialized yet # Data validation happens before distributed initialization # All ranks will independently validate data availability without synchronization - + # All ranks independently validate data exists data_available = data_path.exists() - + if not data_available: print(f"[Rank {rank}] ❌ ERROR: Data file not found at {data_path}") else: print(f"[Rank {rank}] ✅ Data file validated: {data_path}") - + return data_available class SimpleCNN(nn.Module): """Simple CNN model for benchmarking""" + def __init__(self, num_classes=1000): super(SimpleCNN, self).__init__() self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) self.bn1 = nn.BatchNorm2d(64) self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1) self.bn2 = 
nn.BatchNorm2d(128) - + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) self.bn3 = nn.BatchNorm2d(256) - + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(256, num_classes) - + def forward(self, x): x = self.pool(F.relu(self.bn1(self.conv1(x)))) x = self.pool(F.relu(self.bn2(self.conv2(x)))) @@ -169,106 +171,114 @@ def train_epoch(model, optimizer, criterion, epoch, device): epoch_start = time.time() total_samples = 0 total_loss = 0.0 - + for batch_idx in range(NUM_BATCHES): batch_start = time.time() - + # Generate synthetic data images, labels = generate_synthetic_batch(BATCH_SIZE, device) - + # Forward pass optimizer.zero_grad() outputs = model(images) loss = criterion(outputs, labels) - + # Backward pass (gradients are automatically synchronized across GPUs) loss.backward() - + # Update weights optimizer.step() - + batch_time = time.time() - batch_start total_samples += BATCH_SIZE total_loss += loss.item() - + # Print progress from rank 0 if rank == 0 and (batch_idx + 1) % 20 == 0: avg_loss = total_loss / (batch_idx + 1) throughput = BATCH_SIZE * world_size / batch_time - print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " - f"Batch [{batch_idx+1}/{NUM_BATCHES}] " - f"Loss: {loss.item():.4f} " - f"Throughput: {throughput:.2f} samples/sec") - + print( + f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec" + ) + epoch_time = time.time() - epoch_start avg_loss = total_loss / NUM_BATCHES - + # ======================================================================== # Accurate Distributed Throughput Measurement (Best Practice) # ======================================================================== # Calculate local throughput for this rank local_samples = NUM_BATCHES * BATCH_SIZE local_throughput = local_samples / epoch_time - + # Aggregate metrics across all ranks using all_reduce if world_size > 1: # Convert to tensors for all_reduce 
local_throughput_tensor = torch.tensor([local_throughput], device=device) epoch_time_tensor = torch.tensor([epoch_time], device=device) - + # Sum all local throughputs to get true global throughput global_throughput_tensor = local_throughput_tensor.clone() dist.all_reduce(global_throughput_tensor, op=dist.ReduceOp.SUM) - + # Get max epoch time (slowest node determines overall speed) max_epoch_time_tensor = epoch_time_tensor.clone() dist.all_reduce(max_epoch_time_tensor, op=dist.ReduceOp.MAX) - + # Get min epoch time (fastest node) min_epoch_time_tensor = epoch_time_tensor.clone() dist.all_reduce(min_epoch_time_tensor, op=dist.ReduceOp.MIN) - + global_throughput = global_throughput_tensor.item() max_epoch_time = max_epoch_time_tensor.item() min_epoch_time = min_epoch_time_tensor.item() - + # Calculate load imbalance - time_imbalance = ((max_epoch_time - min_epoch_time) / max_epoch_time) * 100 if max_epoch_time > 0 else 0.0 - + time_imbalance = ( + ((max_epoch_time - min_epoch_time) / max_epoch_time) * 100 + if max_epoch_time > 0 + else 0.0 + ) + else: # Single GPU global_throughput = local_throughput max_epoch_time = epoch_time min_epoch_time = epoch_time time_imbalance = 0.0 - + # Return metrics dictionary metrics = { - 'avg_loss': avg_loss, - 'local_throughput': local_throughput, - 'global_throughput': global_throughput, - 'epoch_time': epoch_time, - 'max_epoch_time': max_epoch_time, - 'min_epoch_time': min_epoch_time, - 'time_imbalance': time_imbalance + "avg_loss": avg_loss, + "local_throughput": local_throughput, + "global_throughput": global_throughput, + "epoch_time": epoch_time, + "max_epoch_time": max_epoch_time, + "min_epoch_time": min_epoch_time, + "time_imbalance": time_imbalance, } - + return metrics def main(): """Main training function""" print_header() - + # Create per-process MIOpen cache directory to avoid database conflicts # This must be done AFTER torchrun sets LOCAL_RANK environment variable if "MIOPEN_USER_DB_PATH" in os.environ: # Construct 
the per-process MIOpen path using actual local_rank value miopen_template = os.environ["MIOPEN_USER_DB_PATH"] - miopen_path = miopen_template.replace("${LOCAL_RANK:-0}", str(local_rank)).replace("$LOCAL_RANK", str(local_rank)) + miopen_path = miopen_template.replace( + "${LOCAL_RANK:-0}", str(local_rank) + ).replace("$LOCAL_RANK", str(local_rank)) os.makedirs(miopen_path, exist_ok=True) print(f"[Rank {rank}] ✓ Created MIOpen cache directory: {miopen_path}") - + # ======================================================================== # K8s Best Practice: Validate Data Before Initializing Training # ======================================================================== @@ -276,10 +286,10 @@ def main(): print(f"\n{'='*70}") print("Step 1: Data Provider Validation") print(f"{'='*70}") - + # Validate data availability (all ranks) data_available = validate_data_availability() - + if not data_available: # Exit gracefully if data is not available if rank == 0: @@ -288,10 +298,10 @@ def main(): print(f"{'='*70}") print("Exiting...") sys.exit(1) - + if rank == 0: print(f"\n✅ Data validation complete - proceeding with training\n") - + # ======================================================================== # Initialize Distributed Training # ======================================================================== @@ -300,44 +310,48 @@ def main(): print(f"{'='*70}") print("Step 2: Initialize Distributed Training") print(f"{'='*70}") - + print(f"\n[Rank {rank}] Initializing distributed process group...") # Best practice: Specify device_ids to avoid PyTorch warnings dist.init_process_group( backend="nccl", init_method=f"env://", # Use environment variables (set by torchrun) world_size=world_size, - rank=rank + rank=rank, ) print(f"[Rank {rank}] ✓ Process group initialized") print(f"[Rank {rank}] Backend: {dist.get_backend()}") print(f"[Rank {rank}] World Size: {dist.get_world_size()}") else: print(f"\n=== Running in Standalone Mode (Single GPU) ===") - + # Set device if 
torch.cuda.is_available(): num_gpus = torch.cuda.device_count() print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") - print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") - + print( + f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}" + ) + if local_rank >= num_gpus: - print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print( + f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}" + ) print(f"[Rank {rank}] Using cuda:0 instead") device = torch.device("cuda:0") else: device = torch.device(f"cuda:{local_rank}") - + torch.cuda.set_device(device) print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") else: device = torch.device("cpu") print(f"[Rank {rank}] Warning: CUDA not available, using CPU") - + # Create model print(f"\n[Rank {rank}] Creating model...") model = SimpleCNN(num_classes=NUM_CLASSES).to(device) - + # Wrap model with DDP for distributed training if world_size > 1: # Best practice: Explicitly specify device_ids for DDP @@ -346,89 +360,105 @@ def main(): device_ids=[local_rank], output_device=local_rank, broadcast_buffers=True, # Ensure buffers (like BatchNorm stats) are synced - find_unused_parameters=False # Set True only if needed (performance impact) + find_unused_parameters=False, # Set True only if needed (performance impact) ) print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") - + # Create optimizer and loss function optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) criterion = nn.CrossEntropyLoss() - + # Synchronize before training if world_size > 1: # Best practice: Specify device to avoid warnings dist.barrier(device_ids=[local_rank]) - + if rank == 0: print(f"\n{'='*70}") print("Starting Training") print(f"{'='*70}") - + # Training loop all_metrics = [] for epoch in range(NUM_EPOCHS): - metrics = train_epoch( - model, optimizer, criterion, epoch, device - ) + metrics = 
train_epoch(model, optimizer, criterion, epoch, device) all_metrics.append(metrics) - + if rank == 0: print(f"\nEpoch [{epoch+1}/{NUM_EPOCHS}] Complete:") print(f" Average Loss: {metrics['avg_loss']:.4f}") - print(f" Global Throughput: {metrics['global_throughput']:.2f} samples/sec") + print( + f" Global Throughput: {metrics['global_throughput']:.2f} samples/sec" + ) print(f" Images/sec: {metrics['global_throughput']:.2f}") - + # Show load imbalance warning if significant - if metrics['time_imbalance'] > 5.0: + if metrics["time_imbalance"] > 5.0: print(f" ⚠️ Load Imbalance: {metrics['time_imbalance']:.1f}%") - + # Calculate average metrics across all epochs - avg_global_throughput = sum(m['global_throughput'] for m in all_metrics) / len(all_metrics) - avg_local_throughput = sum(m['local_throughput'] for m in all_metrics) / len(all_metrics) - avg_time_imbalance = sum(m['time_imbalance'] for m in all_metrics) / len(all_metrics) - + avg_global_throughput = sum(m["global_throughput"] for m in all_metrics) / len( + all_metrics + ) + avg_local_throughput = sum(m["local_throughput"] for m in all_metrics) / len( + all_metrics + ) + avg_time_imbalance = sum(m["time_imbalance"] for m in all_metrics) / len( + all_metrics + ) + # Get topology information nproc_per_node = int(os.environ.get("LOCAL_WORLD_SIZE", world_size)) - num_nodes = (world_size + nproc_per_node - 1) // nproc_per_node if nproc_per_node > 0 else 1 + num_nodes = ( + (world_size + nproc_per_node - 1) // nproc_per_node if nproc_per_node > 0 else 1 + ) node_rank = rank // nproc_per_node if nproc_per_node > 0 else 0 - + # Synchronize before final output if world_size > 1: dist.barrier(device_ids=[local_rank]) - + # Each node's rank 0 reports local performance if local_rank == 0: print(f"\n[Node {node_rank}] Local Performance Summary:") - print(f" Node Throughput: {avg_local_throughput * nproc_per_node:.2f} samples/sec") + print( + f" Node Throughput: {avg_local_throughput * nproc_per_node:.2f} samples/sec" + ) 
print(f" GPUs on Node: {nproc_per_node}") print(f" Avg Time per Epoch: {all_metrics[-1]['epoch_time']:.2f}s") - + # Synchronize again before global rank 0 output if world_size > 1: dist.barrier(device_ids=[local_rank]) - + # Global rank 0 reports aggregated performance if rank == 0: print(f"\n{'='*70}") print("Training Complete - GLOBAL METRICS") print(f"{'='*70}") - print(f"Topology: {num_nodes} nodes × {nproc_per_node} GPUs/node = {world_size} total GPUs") + print( + f"Topology: {num_nodes} nodes × {nproc_per_node} GPUs/node = {world_size} total GPUs" + ) print(f"Global Throughput: {avg_global_throughput:.2f} samples/sec") print(f"Per-GPU Throughput: {avg_global_throughput/world_size:.2f} samples/sec") print(f"Global Batch Size: {BATCH_SIZE * world_size}") - + # Calculate scaling efficiency # Ideal throughput = single GPU throughput * number of GPUs ideal_single_gpu_throughput = avg_global_throughput / world_size ideal_throughput = ideal_single_gpu_throughput * world_size - scaling_efficiency = (avg_global_throughput / ideal_throughput) * 100 if ideal_throughput > 0 else 100.0 + scaling_efficiency = ( + (avg_global_throughput / ideal_throughput) * 100 + if ideal_throughput > 0 + else 100.0 + ) print(f"Scaling Efficiency: {scaling_efficiency:.1f}%") - + if avg_time_imbalance > 5.0: print(f"Average Load Imbalance: {avg_time_imbalance:.1f}%") - + print(f"{'='*70}") - + # Save results with topology information with open("training_results.txt", "w") as f: f.write(f"Training Results with Data Provider\n") @@ -441,21 +471,23 @@ def main(): f.write(f"Epochs: {NUM_EPOCHS}\n") f.write(f"Global Throughput: {avg_global_throughput:.2f} samples/sec\n") f.write(f"Scaling Efficiency: {scaling_efficiency:.1f}%\n") - + # Output performance metric for madengine (REQUIRED FORMAT) # Use GLOBAL throughput (sum of all nodes - accurate measurement) print(f"\nperformance: {avg_global_throughput:.2f} samples_per_second") - + # Output topology metadata for parsing - print(f"topology: 
{num_nodes} nodes {nproc_per_node} gpus_per_node {world_size} total_gpus") + print( + f"topology: {num_nodes} nodes {nproc_per_node} gpus_per_node {world_size} total_gpus" + ) print(f"scaling_efficiency: {scaling_efficiency:.2f}") - + # Cleanup if world_size > 1: dist.destroy_process_group() if rank == 0: print(f"✓ Process group destroyed") - + return 0 @@ -465,6 +497,6 @@ def main(): except Exception as e: print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) - diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_nas.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_nas.py index 5599981f..9bb44d14 100755 --- a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_nas.py +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_nas.py @@ -22,14 +22,15 @@ """ import os +import pathlib +import socket import sys import time -import socket -import pathlib + import torch +import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP # Configuration @@ -76,37 +77,37 @@ def print_header(): def validate_data_availability(): """ Validate that required data is available from NAS (K8s best practice). - + Strategy: 1. Rank 0 checks data first and reports status 2. All ranks independently validate data (no barrier needed before init_process_group) 3. Exit gracefully if data missing - + Note: For K8s deployments, MAD_DATAHOME points to PVC mount point (/data). This ensures data is shared across all pods (single-node and multi-node). PVC must be configured with ReadWriteMany for multi-node deployments. - + NAS can be either: - Mounted filesystem (traditional NAS) - Downloaded data to directory (K8s with data provider) - + Similar to run_data_nas.sh: We just verify the data home directory exists and optionally has content. 
No specific file is required - we use synthetic data for training benchmarks. - + Returns: bool: True if data is available, False otherwise """ # K8s best practice: Data stored in PVC at /data (separate from compute pods) data_home = os.environ.get("MAD_DATAHOME", "/data") data_home_path = pathlib.Path(data_home) - + if rank == 0: print(f"\n{'='*70}") print("NAS Data Provider Validation") print(f"{'='*70}") print(f"Data Home: {data_home}") - + # Check if data directory exists if not data_home_path.exists(): print(f"❌ Data home directory NOT found!") @@ -114,7 +115,7 @@ def validate_data_availability(): print(f" MAD_DATAHOME must be set and directory must exist") else: print(f"✅ Data home directory exists: {data_home}") - + # Check if directory has content (similar to run_data_nas.sh) try: dir_contents = list(data_home_path.iterdir()) @@ -138,42 +139,43 @@ def validate_data_availability(): except PermissionError: print(f"⚠️ Cannot read directory contents (permission denied)") print(f" Directory exists but contents not accessible") - + print(f"{'='*70}\n") - + # Note: Cannot use dist.barrier() here - process group not initialized yet # Data validation happens before distributed initialization # All ranks will independently validate data availability without synchronization - + # All ranks independently validate data home exists # We don't require a specific file - just that the directory exists data_available = data_home_path.exists() - + if not data_available: print(f"[Rank {rank}] ❌ ERROR: Data home not found at {data_home}") else: print(f"[Rank {rank}] ✅ Data home validated: {data_home}") - + return data_available class SimpleCNN(nn.Module): """Simple CNN model for benchmarking""" + def __init__(self, num_classes=1000): super(SimpleCNN, self).__init__() self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) self.bn1 = nn.BatchNorm2d(64) self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, 
padding=1) self.bn2 = nn.BatchNorm2d(128) - + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) self.bn3 = nn.BatchNorm2d(256) - + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(256, num_classes) - + def forward(self, x): x = self.pool(F.relu(self.bn1(self.conv1(x)))) x = self.pool(F.relu(self.bn2(self.conv2(x)))) @@ -197,106 +199,114 @@ def train_epoch(model, optimizer, criterion, epoch, device): epoch_start = time.time() total_samples = 0 total_loss = 0.0 - + for batch_idx in range(NUM_BATCHES): batch_start = time.time() - + # Generate synthetic data images, labels = generate_synthetic_batch(BATCH_SIZE, device) - + # Forward pass optimizer.zero_grad() outputs = model(images) loss = criterion(outputs, labels) - + # Backward pass (gradients are automatically synchronized across GPUs) loss.backward() - + # Update weights optimizer.step() - + batch_time = time.time() - batch_start total_samples += BATCH_SIZE total_loss += loss.item() - + # Print progress from rank 0 if rank == 0 and (batch_idx + 1) % 20 == 0: avg_loss = total_loss / (batch_idx + 1) throughput = BATCH_SIZE * world_size / batch_time - print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " - f"Batch [{batch_idx+1}/{NUM_BATCHES}] " - f"Loss: {loss.item():.4f} " - f"Throughput: {throughput:.2f} samples/sec") - + print( + f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec" + ) + epoch_time = time.time() - epoch_start avg_loss = total_loss / NUM_BATCHES - + # ======================================================================== # Accurate Distributed Throughput Measurement (Best Practice) # ======================================================================== # Calculate local throughput for this rank local_samples = NUM_BATCHES * BATCH_SIZE local_throughput = local_samples / epoch_time - + # Aggregate metrics across all ranks using all_reduce if world_size > 1: # Convert to tensors for 
all_reduce local_throughput_tensor = torch.tensor([local_throughput], device=device) epoch_time_tensor = torch.tensor([epoch_time], device=device) - + # Sum all local throughputs to get true global throughput global_throughput_tensor = local_throughput_tensor.clone() dist.all_reduce(global_throughput_tensor, op=dist.ReduceOp.SUM) - + # Get max epoch time (slowest node determines overall speed) max_epoch_time_tensor = epoch_time_tensor.clone() dist.all_reduce(max_epoch_time_tensor, op=dist.ReduceOp.MAX) - + # Get min epoch time (fastest node) min_epoch_time_tensor = epoch_time_tensor.clone() dist.all_reduce(min_epoch_time_tensor, op=dist.ReduceOp.MIN) - + global_throughput = global_throughput_tensor.item() max_epoch_time = max_epoch_time_tensor.item() min_epoch_time = min_epoch_time_tensor.item() - + # Calculate load imbalance - time_imbalance = ((max_epoch_time - min_epoch_time) / max_epoch_time) * 100 if max_epoch_time > 0 else 0.0 - + time_imbalance = ( + ((max_epoch_time - min_epoch_time) / max_epoch_time) * 100 + if max_epoch_time > 0 + else 0.0 + ) + else: # Single GPU global_throughput = local_throughput max_epoch_time = epoch_time min_epoch_time = epoch_time time_imbalance = 0.0 - + # Return metrics dictionary metrics = { - 'avg_loss': avg_loss, - 'local_throughput': local_throughput, - 'global_throughput': global_throughput, - 'epoch_time': epoch_time, - 'max_epoch_time': max_epoch_time, - 'min_epoch_time': min_epoch_time, - 'time_imbalance': time_imbalance + "avg_loss": avg_loss, + "local_throughput": local_throughput, + "global_throughput": global_throughput, + "epoch_time": epoch_time, + "max_epoch_time": max_epoch_time, + "min_epoch_time": min_epoch_time, + "time_imbalance": time_imbalance, } - + return metrics def main(): """Main training function""" print_header() - + # Create per-process MIOpen cache directory to avoid database conflicts # This must be done AFTER torchrun sets LOCAL_RANK environment variable if "MIOPEN_USER_DB_PATH" in os.environ: # 
Construct the per-process MIOpen path using actual local_rank value miopen_template = os.environ["MIOPEN_USER_DB_PATH"] - miopen_path = miopen_template.replace("${LOCAL_RANK:-0}", str(local_rank)).replace("$LOCAL_RANK", str(local_rank)) + miopen_path = miopen_template.replace( + "${LOCAL_RANK:-0}", str(local_rank) + ).replace("$LOCAL_RANK", str(local_rank)) os.makedirs(miopen_path, exist_ok=True) print(f"[Rank {rank}] ✓ Created MIOpen cache directory: {miopen_path}") - + # ======================================================================== # K8s Best Practice: Validate Data Before Initializing Training # ======================================================================== @@ -304,10 +314,10 @@ def main(): print(f"\n{'='*70}") print("Step 1: NAS Data Provider Validation") print(f"{'='*70}") - + # Validate data availability (all ranks) data_available = validate_data_availability() - + if not data_available: # Exit gracefully if data is not available if rank == 0: @@ -316,10 +326,10 @@ def main(): print(f"{'='*70}") print("Exiting...") sys.exit(1) - + if rank == 0: print(f"\n✅ Data validation complete - proceeding with training\n") - + # ======================================================================== # Initialize Distributed Training # ======================================================================== @@ -328,44 +338,48 @@ def main(): print(f"{'='*70}") print("Step 2: Initialize Distributed Training") print(f"{'='*70}") - + print(f"\n[Rank {rank}] Initializing distributed process group...") # Best practice: Specify device_ids to avoid PyTorch warnings dist.init_process_group( backend="nccl", init_method=f"env://", # Use environment variables (set by torchrun) world_size=world_size, - rank=rank + rank=rank, ) print(f"[Rank {rank}] ✓ Process group initialized") print(f"[Rank {rank}] Backend: {dist.get_backend()}") print(f"[Rank {rank}] World Size: {dist.get_world_size()}") else: print(f"\n=== Running in Standalone Mode (Single GPU) ===") - + # 
Set device if torch.cuda.is_available(): num_gpus = torch.cuda.device_count() print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") - print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") - + print( + f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}" + ) + if local_rank >= num_gpus: - print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print( + f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}" + ) print(f"[Rank {rank}] Using cuda:0 instead") device = torch.device("cuda:0") else: device = torch.device(f"cuda:{local_rank}") - + torch.cuda.set_device(device) print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") else: device = torch.device("cpu") print(f"[Rank {rank}] Warning: CUDA not available, using CPU") - + # Create model print(f"\n[Rank {rank}] Creating model...") model = SimpleCNN(num_classes=NUM_CLASSES).to(device) - + # Wrap model with DDP for distributed training if world_size > 1: # Best practice: Explicitly specify device_ids for DDP @@ -374,89 +388,105 @@ def main(): device_ids=[local_rank], output_device=local_rank, broadcast_buffers=True, # Ensure buffers (like BatchNorm stats) are synced - find_unused_parameters=False # Set True only if needed (performance impact) + find_unused_parameters=False, # Set True only if needed (performance impact) ) print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") - + # Create optimizer and loss function optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) criterion = nn.CrossEntropyLoss() - + # Synchronize before training if world_size > 1: # Best practice: Specify device to avoid warnings dist.barrier(device_ids=[local_rank]) - + if rank == 0: print(f"\n{'='*70}") print("Starting Training") print(f"{'='*70}") - + # Training loop all_metrics = [] for epoch in range(NUM_EPOCHS): - metrics = train_epoch( - model, optimizer, criterion, epoch, device - ) 
+ metrics = train_epoch(model, optimizer, criterion, epoch, device) all_metrics.append(metrics) - + if rank == 0: print(f"\nEpoch [{epoch+1}/{NUM_EPOCHS}] Complete:") print(f" Average Loss: {metrics['avg_loss']:.4f}") - print(f" Global Throughput: {metrics['global_throughput']:.2f} samples/sec") + print( + f" Global Throughput: {metrics['global_throughput']:.2f} samples/sec" + ) print(f" Images/sec: {metrics['global_throughput']:.2f}") - + # Show load imbalance warning if significant - if metrics['time_imbalance'] > 5.0: + if metrics["time_imbalance"] > 5.0: print(f" ⚠️ Load Imbalance: {metrics['time_imbalance']:.1f}%") - + # Calculate average metrics across all epochs - avg_global_throughput = sum(m['global_throughput'] for m in all_metrics) / len(all_metrics) - avg_local_throughput = sum(m['local_throughput'] for m in all_metrics) / len(all_metrics) - avg_time_imbalance = sum(m['time_imbalance'] for m in all_metrics) / len(all_metrics) - + avg_global_throughput = sum(m["global_throughput"] for m in all_metrics) / len( + all_metrics + ) + avg_local_throughput = sum(m["local_throughput"] for m in all_metrics) / len( + all_metrics + ) + avg_time_imbalance = sum(m["time_imbalance"] for m in all_metrics) / len( + all_metrics + ) + # Get topology information nproc_per_node = int(os.environ.get("LOCAL_WORLD_SIZE", world_size)) - num_nodes = (world_size + nproc_per_node - 1) // nproc_per_node if nproc_per_node > 0 else 1 + num_nodes = ( + (world_size + nproc_per_node - 1) // nproc_per_node if nproc_per_node > 0 else 1 + ) node_rank = rank // nproc_per_node if nproc_per_node > 0 else 0 - + # Synchronize before final output if world_size > 1: dist.barrier(device_ids=[local_rank]) - + # Each node's rank 0 reports local performance if local_rank == 0: print(f"\n[Node {node_rank}] Local Performance Summary:") - print(f" Node Throughput: {avg_local_throughput * nproc_per_node:.2f} samples/sec") + print( + f" Node Throughput: {avg_local_throughput * nproc_per_node:.2f} 
samples/sec" + ) print(f" GPUs on Node: {nproc_per_node}") print(f" Avg Time per Epoch: {all_metrics[-1]['epoch_time']:.2f}s") - + # Synchronize again before global rank 0 output if world_size > 1: dist.barrier(device_ids=[local_rank]) - + # Global rank 0 reports aggregated performance if rank == 0: print(f"\n{'='*70}") print("Training Complete - GLOBAL METRICS") print(f"{'='*70}") - print(f"Topology: {num_nodes} nodes × {nproc_per_node} GPUs/node = {world_size} total GPUs") + print( + f"Topology: {num_nodes} nodes × {nproc_per_node} GPUs/node = {world_size} total GPUs" + ) print(f"Global Throughput: {avg_global_throughput:.2f} samples/sec") print(f"Per-GPU Throughput: {avg_global_throughput/world_size:.2f} samples/sec") print(f"Global Batch Size: {BATCH_SIZE * world_size}") - + # Calculate scaling efficiency # Ideal throughput = single GPU throughput * number of GPUs ideal_single_gpu_throughput = avg_global_throughput / world_size ideal_throughput = ideal_single_gpu_throughput * world_size - scaling_efficiency = (avg_global_throughput / ideal_throughput) * 100 if ideal_throughput > 0 else 100.0 + scaling_efficiency = ( + (avg_global_throughput / ideal_throughput) * 100 + if ideal_throughput > 0 + else 100.0 + ) print(f"Scaling Efficiency: {scaling_efficiency:.1f}%") - + if avg_time_imbalance > 5.0: print(f"Average Load Imbalance: {avg_time_imbalance:.1f}%") - + print(f"{'='*70}") - + # Save results with topology information data_home = os.environ.get("MAD_DATAHOME", "/data") with open("training_results.txt", "w") as f: @@ -470,21 +500,23 @@ def main(): f.write(f"Epochs: {NUM_EPOCHS}\n") f.write(f"Global Throughput: {avg_global_throughput:.2f} samples/sec\n") f.write(f"Scaling Efficiency: {scaling_efficiency:.1f}%\n") - + # Output performance metric for madengine (REQUIRED FORMAT) # Use GLOBAL throughput (sum of all nodes - accurate measurement) print(f"\nperformance: {avg_global_throughput:.2f} samples_per_second") - + # Output topology metadata for parsing - 
print(f"topology: {num_nodes} nodes {nproc_per_node} gpus_per_node {world_size} total_gpus") + print( + f"topology: {num_nodes} nodes {nproc_per_node} gpus_per_node {world_size} total_gpus" + ) print(f"scaling_efficiency: {scaling_efficiency:.2f}") - + # Cleanup if world_size > 1: dist.destroy_process_group() if rank == 0: print(f"✓ Process group destroyed") - + return 0 @@ -494,6 +526,6 @@ def main(): except Exception as e: print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) - diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_multi.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_multi.py index dc09b9cf..2ad1e0c9 100644 --- a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_multi.py +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_multi.py @@ -11,15 +11,16 @@ torchrun --standalone --nproc_per_node=8 run_torchrun_multi.py """ +import csv import os +import socket import sys import time -import socket -import csv + import torch +import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP # Configuration @@ -60,21 +61,22 @@ def print_header(): class SimpleCNN(nn.Module): """Simple CNN model for benchmarking""" + def __init__(self, num_classes=1000): super(SimpleCNN, self).__init__() self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) self.bn1 = nn.BatchNorm2d(64) self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1) self.bn2 = nn.BatchNorm2d(128) - + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) self.bn3 = nn.BatchNorm2d(256) - + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(256, num_classes) - + def forward(self, x): x = self.pool(F.relu(self.bn1(self.conv1(x)))) x = self.pool(F.relu(self.bn2(self.conv2(x)))) @@ -98,40 +100,42 
@@ def train_epoch(model, optimizer, criterion, epoch, device): epoch_start = time.time() total_samples = 0 total_loss = 0.0 - + for batch_idx in range(NUM_BATCHES): batch_start = time.time() - + # Generate synthetic data images, labels = generate_synthetic_batch(BATCH_SIZE, device) - + # Forward pass optimizer.zero_grad() outputs = model(images) loss = criterion(outputs, labels) - + # Backward pass (gradients are automatically synchronized across GPUs) loss.backward() - + # Update weights optimizer.step() - + batch_time = time.time() - batch_start total_samples += BATCH_SIZE total_loss += loss.item() - + # Print progress from local rank 0 on each node if local_rank == 0 and (batch_idx + 1) % 20 == 0: avg_loss = total_loss / (batch_idx + 1) throughput = BATCH_SIZE / batch_time # Local throughput - print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " - f"Batch [{batch_idx+1}/{NUM_BATCHES}] " - f"Loss: {loss.item():.4f} " - f"Throughput: {throughput:.2f} samples/sec (local)") - + print( + f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec (local)" + ) + epoch_time = time.time() - epoch_start avg_loss = total_loss / NUM_BATCHES - + # ======================================================================== # Node-Local Throughput Measurement # ======================================================================== @@ -139,112 +143,115 @@ def train_epoch(model, optimizer, criterion, epoch, device): local_gpu_throughput = local_samples / epoch_time local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) node_throughput = local_gpu_throughput * local_world_size - + metrics = { - 'avg_loss': avg_loss, - 'node_throughput': node_throughput, - 'epoch_time': epoch_time, - 'local_world_size': local_world_size + "avg_loss": avg_loss, + "node_throughput": node_throughput, + "epoch_time": epoch_time, + "local_world_size": local_world_size, } - + return metrics def main(): """Main 
training function""" test_start_time = time.time() - + print_header() - + # Create per-process MIOpen cache directory to avoid database conflicts if "MIOPEN_USER_DB_PATH" in os.environ: miopen_template = os.environ["MIOPEN_USER_DB_PATH"] - miopen_path = miopen_template.replace("${LOCAL_RANK:-0}", str(local_rank)).replace("$LOCAL_RANK", str(local_rank)) + miopen_path = miopen_template.replace( + "${LOCAL_RANK:-0}", str(local_rank) + ).replace("$LOCAL_RANK", str(local_rank)) os.makedirs(miopen_path, exist_ok=True) print(f"[Rank {rank}] ✓ Created MIOpen cache directory: {miopen_path}") - + # Initialize distributed training if world_size > 1: print(f"\n[Rank {rank}] Initializing distributed process group...") dist.init_process_group( - backend="nccl", - init_method=f"env://", - world_size=world_size, - rank=rank + backend="nccl", init_method=f"env://", world_size=world_size, rank=rank ) print(f"[Rank {rank}] ✓ Process group initialized") print(f"[Rank {rank}] Backend: {dist.get_backend()}") print(f"[Rank {rank}] World Size: {dist.get_world_size()}") else: print(f"\n=== Running in Standalone Mode (Single GPU) ===") - + # Set device if torch.cuda.is_available(): num_gpus = torch.cuda.device_count() print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") - print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") - + print( + f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}" + ) + if local_rank >= num_gpus: - print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print( + f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}" + ) print(f"[Rank {rank}] Using cuda:0 instead") device = torch.device("cuda:0") else: device = torch.device(f"cuda:{local_rank}") - + torch.cuda.set_device(device) print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") else: device = torch.device("cpu") print(f"[Rank {rank}] Warning: CUDA not available, using CPU") - + # Create 
model print(f"\n[Rank {rank}] Creating model...") model = SimpleCNN(num_classes=NUM_CLASSES).to(device) - + if world_size > 1: model = DDP( model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=True, - find_unused_parameters=False + find_unused_parameters=False, ) print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") - + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) criterion = nn.CrossEntropyLoss() - + if world_size > 1: dist.barrier(device_ids=[local_rank]) - + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) node_rank = rank // local_world_size if local_world_size > 0 else 0 - + if local_rank == 0: print(f"\n{'='*70}") print(f"[Node {node_rank}] Starting Training") print(f"{'='*70}") - + # Training loop all_metrics = [] for epoch in range(NUM_EPOCHS): - metrics = train_epoch( - model, optimizer, criterion, epoch, device - ) + metrics = train_epoch(model, optimizer, criterion, epoch, device) all_metrics.append(metrics) - + if local_rank == 0: print(f"\n[Node {node_rank}] Epoch [{epoch+1}/{NUM_EPOCHS}] Complete:") print(f" Average Loss: {metrics['avg_loss']:.4f}") print(f" Node Throughput: {metrics['node_throughput']:.2f} samples/sec") print(f" Local GPUs: {metrics['local_world_size']}") - - avg_node_throughput = sum(m['node_throughput'] for m in all_metrics) / len(all_metrics) - avg_epoch_time = sum(m['epoch_time'] for m in all_metrics) / len(all_metrics) - + + avg_node_throughput = sum(m["node_throughput"] for m in all_metrics) / len( + all_metrics + ) + avg_epoch_time = sum(m["epoch_time"] for m in all_metrics) / len(all_metrics) + if world_size > 1: dist.barrier(device_ids=[local_rank]) - + # Node-Local Performance Reporting + multiple_results CSV if local_rank == 0: print(f"\n{'='*70}") @@ -256,11 +263,11 @@ def main(): print(f"Node Throughput: {avg_node_throughput:.2f} samples_per_second") print(f"Avg Time per Epoch: {avg_epoch_time:.2f}s") print(f"{'='*70}") - + print(f"performance: 
{avg_node_throughput:.2f} samples_per_second", flush=True) print(f"node_id: {node_rank}", flush=True) print(f"local_gpus: {local_world_size}", flush=True) - + test_duration = time.time() - test_start_time print(f"test_duration: {test_duration:.2f}s", flush=True) sys.stdout.flush() @@ -268,25 +275,29 @@ def main(): # Write multiple_results CSV (model,temperature,performance,metric,test_duration for duration in reports) with open(MULTI_RESULTS_CSV, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["model", "temperature", "performance", "metric", "test_duration"]) + writer.writerow( + ["model", "temperature", "performance", "metric", "test_duration"] + ) test_dur_str = f"{test_duration:.2f}s" for i in range(4): # Vary temperature for multiple rows; use throughput for performance - writer.writerow([ - i + 1, - 20 + i * 5, - f"{avg_node_throughput:.2f}", - "samples_per_sec", - test_dur_str, - ]) + writer.writerow( + [ + i + 1, + 20 + i * 5, + f"{avg_node_throughput:.2f}", + "samples_per_sec", + test_dur_str, + ] + ) print(f"Wrote {MULTI_RESULTS_CSV} for multiple_results collection", flush=True) - + # Cleanup if world_size > 1: dist.destroy_process_group() if rank == 0: print(f"✓ Process group destroyed") - + return 0 @@ -296,5 +307,6 @@ def main(): except Exception as e: print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_with_helper.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_with_helper.py index 68329eb5..c86142d2 100644 --- a/tests/fixtures/dummy/scripts/dummy_torchrun/run_with_helper.py +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_with_helper.py @@ -25,7 +25,7 @@ BenchmarkConfig, print_distributed_info, print_gpu_info, - calculate_model_size + calculate_model_size, ) # Get distributed environment variables (set by torchrun) @@ -51,40 +51,44 @@ def train_epoch(model, dataset, optimizer, criterion, epoch, device, config): 
model.train() epoch_start = time.time() total_loss = 0.0 - + for batch_idx in range(dataset.num_batches): batch_start = time.time() - + # Generate synthetic data images, labels = dataset.generate_batch(device) - + # Forward pass optimizer.zero_grad() outputs = model(images) loss = criterion(outputs, labels) - + # Backward pass (gradients automatically synchronized) loss.backward() - + # Update weights optimizer.step() - + batch_time = time.time() - batch_start total_loss += loss.item() - + # Print progress from rank 0 if rank == 0 and (batch_idx + 1) % 20 == 0: avg_loss = total_loss / (batch_idx + 1) throughput = config.batch_size * world_size / batch_time - print(f"Epoch [{epoch+1}/{config.num_epochs}] " - f"Batch [{batch_idx+1}/{dataset.num_batches}] " - f"Loss: {loss.item():.4f} " - f"Throughput: {throughput:.2f} samples/sec") - + print( + f"Epoch [{epoch+1}/{config.num_epochs}] " + f"Batch [{batch_idx+1}/{dataset.num_batches}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec" + ) + epoch_time = time.time() - epoch_start avg_loss = total_loss / dataset.num_batches - epoch_throughput = (dataset.num_batches * config.batch_size * world_size) / epoch_time - + epoch_throughput = ( + dataset.num_batches * config.batch_size * world_size + ) / epoch_time + return avg_loss, epoch_throughput @@ -92,12 +96,12 @@ def main(): """Main training function""" # Load configuration config = BenchmarkConfig() - + print_header(config) - + # Print distributed info print_distributed_info(rank, local_rank, world_size) - + # Initialize distributed training if world_size > 1: print(f"\n[Rank {rank}] Initializing distributed process group...") @@ -106,40 +110,43 @@ def main(): backend="nccl", init_method=f"env://", # Use environment variables (set by torchrun) world_size=world_size, - rank=rank + rank=rank, ) print(f"[Rank {rank}] ✓ Process group initialized") print(f"[Rank {rank}] Backend: {dist.get_backend()}") print(f"[Rank {rank}] World Size: 
{dist.get_world_size()}") else: print(f"\n=== Running in Standalone Mode (Single GPU) ===") - + # Set device if torch.cuda.is_available(): num_gpus = torch.cuda.device_count() print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") - print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") - + print( + f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}" + ) + if local_rank >= num_gpus: - print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print( + f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}" + ) print(f"[Rank {rank}] Using cuda:0 instead") device = torch.device("cuda:0") else: device = torch.device(f"cuda:{local_rank}") - + torch.cuda.set_device(device) print_gpu_info(rank, device) else: device = torch.device("cpu") print(f"[Rank {rank}] Warning: CUDA not available, using CPU") - + # Create model from helper module print(f"\n[Rank {rank}] Creating ResNet model from helper module...") model = ResNetModel( - num_classes=config.num_classes, - num_blocks=config.resnet_blocks + num_classes=config.num_classes, num_blocks=config.resnet_blocks ).to(device) - + # Print model info if rank == 0: total_params, trainable_params = calculate_model_size(model) @@ -147,7 +154,7 @@ def main(): print(f" Total Parameters: {total_params:,}") print(f" Trainable Parameters: {trainable_params:,}") print(f" Model Size: {total_params * 4 / 1e6:.2f} MB (FP32)") - + # Wrap model with DDP for distributed training if world_size > 1: # Best practice: Explicitly specify device_ids for DDP @@ -156,37 +163,37 @@ def main(): device_ids=[local_rank], output_device=local_rank, broadcast_buffers=True, # Ensure buffers (like BatchNorm stats) are synced - find_unused_parameters=False # Set True only if needed (performance impact) + find_unused_parameters=False, # Set True only if needed (performance impact) ) print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") - + # 
Create dataset dataset = SyntheticDataset( num_samples=config.num_batches * config.batch_size, batch_size=config.batch_size, image_size=config.image_size, - num_classes=config.num_classes + num_classes=config.num_classes, ) - + # Create optimizer and loss function optimizer = torch.optim.SGD( model.parameters(), lr=config.learning_rate, momentum=config.momentum, - weight_decay=config.weight_decay + weight_decay=config.weight_decay, ) criterion = nn.CrossEntropyLoss() - + # Synchronize before training if world_size > 1: # Best practice: Specify device to avoid warnings dist.barrier(device_ids=[local_rank]) - + if rank == 0: print(f"\n{'='*70}") print("Starting Training") print(f"{'='*70}") - + # Training loop all_throughputs = [] for epoch in range(config.num_epochs): @@ -194,19 +201,19 @@ def main(): model, dataset, optimizer, criterion, epoch, device, config ) all_throughputs.append(epoch_throughput) - + if rank == 0: print(f"\nEpoch [{epoch+1}/{config.num_epochs}] Complete:") print(f" Average Loss: {avg_loss:.4f}") print(f" Throughput: {epoch_throughput:.2f} samples/sec") - + # Calculate final metrics avg_throughput = sum(all_throughputs) / len(all_throughputs) - + # Synchronize before final output if world_size > 1: dist.barrier(device_ids=[local_rank]) - + if rank == 0: print(f"\n{'='*70}") print("Training Complete") @@ -216,7 +223,7 @@ def main(): print(f"Number of GPUs: {world_size}") print(f"Model: ResNet with {sum(config.resnet_blocks)} blocks") print(f"{'='*70}") - + # Save results with open("training_results_helper.txt", "w") as f: f.write(f"Training Results (with Helper Modules)\n") @@ -227,16 +234,16 @@ def main(): f.write(f"Epochs: {config.num_epochs}\n") f.write(f"Model: ResNet-{sum(config.resnet_blocks)*2+2}\n") f.write(f"Average Throughput: {avg_throughput:.2f} samples/sec\n") - + # Output performance metric for madengine (REQUIRED FORMAT) print(f"\nperformance: {avg_throughput:.2f} samples_per_second") - + # Cleanup if world_size > 1: 
dist.destroy_process_group() if rank == 0: print(f"✓ Process group destroyed") - + return 0 @@ -246,5 +253,6 @@ def main(): except Exception as e: print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_vllm/run_vllm_inference.py b/tests/fixtures/dummy/scripts/dummy_vllm/run_vllm_inference.py index f4c6a5f3..983709fc 100755 --- a/tests/fixtures/dummy/scripts/dummy_vllm/run_vllm_inference.py +++ b/tests/fixtures/dummy/scripts/dummy_vllm/run_vllm_inference.py @@ -14,11 +14,11 @@ Multi-node: Use Ray backend with proper configuration """ +import argparse import os +import socket import sys import time -import argparse -import socket from typing import List, Optional # Configure environment before importing vLLM @@ -30,8 +30,8 @@ os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") try: - from vllm import LLM, SamplingParams import torch + from vllm import LLM, SamplingParams except ImportError as e: print(f"Error importing required libraries: {e}") print("Please ensure vLLM and PyTorch are installed") @@ -60,10 +60,10 @@ def print_header(args): print("vLLM V1 Engine Distributed Inference Benchmark") print("=" * 70) print(f"Hostname: {socket.gethostname()}") - + nnodes = int(os.environ.get("NNODES", "1")) node_rank = int(os.environ.get("NODE_RANK", "0")) - + print(f"Model: {args.model}") print(f"Tensor Parallel Size: {args.tensor_parallel_size}") print(f"Pipeline Parallel Size: {args.pipeline_parallel_size}") @@ -96,19 +96,21 @@ def run_inference(args): print("\n" + "=" * 70) print("Initializing vLLM V1 Engine") print("=" * 70) - + nnodes = int(os.environ.get("NNODES", "1")) node_rank = int(os.environ.get("NODE_RANK", "0")) - + if args.distributed_backend == "auto": distributed_backend = "ray" if nnodes > 1 else None else: - distributed_backend = args.distributed_backend if args.distributed_backend != "none" else None - + distributed_backend = ( + 
args.distributed_backend if args.distributed_backend != "none" else None + ) + # Use requested TP and PP (multi-node uses TP+PP from madengine env; no forced PP=1) effective_pipeline_size = args.pipeline_parallel_size effective_gpu_memory = 0.60 if args.pipeline_parallel_size > 1 else 0.85 - + if nnodes > 1 and distributed_backend == "ray": print("=" * 70) print("MULTI-NODE TP + PP (single Ray cluster)") @@ -118,9 +120,9 @@ def run_inference(args): print(f"Pipeline Parallel Size: {effective_pipeline_size}") print(f"Total GPUs: {args.tensor_parallel_size * effective_pipeline_size}") print("=" * 70) - + print(f"Using distributed backend: {distributed_backend or 'default'}") - + # Initialize vLLM LLM engine with V1-specific settings try: llm_kwargs = { @@ -133,59 +135,62 @@ def run_inference(args): "max_model_len": 2048, "disable_log_stats": True, # Reduce logging noise } - + # Add distributed backend if specified if distributed_backend: llm_kwargs["distributed_executor_backend"] = distributed_backend - + # V1 engine specific: enforce_eager mode for compatibility if args.enforce_eager: llm_kwargs["enforce_eager"] = True - + llm = LLM(**llm_kwargs) print("✓ vLLM V1 engine initialized successfully") if nnodes > 1: - print(f"✓ Multi-node TP={args.tensor_parallel_size} PP={effective_pipeline_size} (Ray)") + print( + f"✓ Multi-node TP={args.tensor_parallel_size} PP={effective_pipeline_size} (Ray)" + ) except Exception as e: print(f"✗ Failed to initialize vLLM engine: {e}") import traceback + traceback.print_exc() return 1 - + # Configure sampling parameters sampling_params = SamplingParams( temperature=TEMPERATURE, top_p=TOP_P, max_tokens=MAX_TOKENS, ) - + print(f"\n{'=' * 70}") print("Running Inference") print("=" * 70) - + # Generate prompts prompts = generate_prompts(NUM_PROMPTS) - + # Warmup run (not timed) print("\nWarmup: Running 10 prompts...") warmup_prompts = prompts[:10] _ = llm.generate(warmup_prompts, sampling_params) print("✓ Warmup complete") - + # Benchmark 
run (timed) print(f"\nBenchmark: Running {NUM_PROMPTS} prompts...") start_time = time.time() - + outputs = llm.generate(prompts, sampling_params) - + end_time = time.time() elapsed_time = end_time - start_time - + # Calculate metrics total_tokens = sum(len(output.outputs[0].token_ids) for output in outputs) throughput = NUM_PROMPTS / elapsed_time tokens_per_second = total_tokens / elapsed_time - + print(f"\n{'=' * 70}") print("Benchmark Results") print("=" * 70) @@ -195,9 +200,11 @@ def run_inference(args): print(f"Token generation: {tokens_per_second:.2f} tokens/second") print(f"Average latency: {(elapsed_time / NUM_PROMPTS) * 1000:.2f} ms/request") if nnodes > 1: - print(f"(Multi-node TP+PP: single replica across {args.tensor_parallel_size * effective_pipeline_size} GPUs)") + print( + f"(Multi-node TP+PP: single replica across {args.tensor_parallel_size * effective_pipeline_size} GPUs)" + ) print("=" * 70) - + # Print sample outputs print("\n" + "=" * 70) print("Sample Outputs (first 3)") @@ -207,7 +214,7 @@ def run_inference(args): generated_text = output.outputs[0].text print(f"\n[Prompt {i+1}]: {prompt}") print(f"[Output {i+1}]: {generated_text[:200]}...") # First 200 chars - + # madengine output format print(f"\nperformance: {throughput:.2f} requests_per_second") print(f"tokens_per_second: {tokens_per_second:.2f}") @@ -217,7 +224,7 @@ def run_inference(args): if nnodes > 1: print(f"nnodes: {nnodes}") print(f"distributed_backend: {distributed_backend or 'default'}") - + return 0 @@ -230,47 +237,47 @@ def main(): "--model", type=str, default=DEFAULT_MODEL, - help=f"Model name or path (default: {DEFAULT_MODEL})" + help=f"Model name or path (default: {DEFAULT_MODEL})", ) parser.add_argument( "--tensor-parallel-size", type=int, default=1, - help="Number of GPUs for tensor parallelism (default: 1)" + help="Number of GPUs for tensor parallelism (default: 1)", ) parser.add_argument( "--pipeline-parallel-size", type=int, default=1, - help="Number of nodes for pipeline 
parallelism (default: 1)" + help="Number of nodes for pipeline parallelism (default: 1)", ) parser.add_argument( "--distributed-backend", type=str, choices=["auto", "ray", "mp", "none"], default="auto", - help="Distributed backend: auto (default), ray (multi-node), mp (multiprocessing), none" + help="Distributed backend: auto (default), ray (multi-node), mp (multiprocessing), none", ) parser.add_argument( "--enforce-eager", action="store_true", - help="Disable CUDA graph for compatibility" + help="Disable CUDA graph for compatibility", ) - + args = parser.parse_args() - + # Validate arguments if args.tensor_parallel_size < 1: print("Error: tensor-parallel-size must be >= 1") return 1 - + if args.pipeline_parallel_size < 1: print("Error: pipeline-parallel-size must be >= 1") return 1 - + # Print configuration print_header(args) - + # Run inference benchmark return run_inference(args) @@ -284,6 +291,6 @@ def main(): except Exception as e: print(f"\nError: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) - diff --git a/tests/fixtures/dummy/scripts/pyt_huggingface_bert/run.sh b/tests/fixtures/dummy/scripts/pyt_huggingface_bert/run.sh index 8693dc66..5e06316d 100644 --- a/tests/fixtures/dummy/scripts/pyt_huggingface_bert/run.sh +++ b/tests/fixtures/dummy/scripts/pyt_huggingface_bert/run.sh @@ -62,6 +62,6 @@ torchrun $HF_PATH/examples/pytorch/language-modeling/run_mlm.py \ # output performance metric performance=$(cat log.txt | grep -Eo "train_samples_per_second':[^,]+" | sed "s/train_samples_per_second': //g" | head -n 1) -# unset printing trace to not confuse Jenkinsfile +# unset printing trace to not confuse Jenkinsfile set +x echo "performance: $performance samples_per_second" diff --git a/tests/fixtures/utils.py b/tests/fixtures/utils.py index 53cd6938..3d8c399f 100644 --- a/tests/fixtures/utils.py +++ b/tests/fixtures/utils.py @@ -5,15 +5,16 @@ # built-in modules import csv +import json import os import re import shutil import subprocess 
import sys -import json -import pytest from unittest.mock import MagicMock +import pytest + MODEL_DIR = "tests/fixtures/dummy" BASE_DIR = os.path.join(os.path.dirname(__file__), "..", "..") sys.path.insert(1, BASE_DIR) @@ -48,11 +49,11 @@ def has_gpu() -> bool: # This is safe for pytest collection and avoids hanging nvidia_exists = os.path.exists("/usr/bin/nvidia-smi") from madengine.core.constants import get_rocm_path + rocm_path = get_rocm_path() - amd_rocm_exists = ( - os.path.exists(os.path.join(rocm_path, "bin", "rocm-smi")) - or os.path.exists("/usr/local/bin/rocm-smi") - ) + amd_rocm_exists = os.path.exists( + os.path.join(rocm_path, "bin", "rocm-smi") + ) or os.path.exists("/usr/local/bin/rocm-smi") _has_gpu_cache = nvidia_exists or amd_rocm_exists @@ -89,18 +90,18 @@ def global_data(): def clean_test_temp_files(request): """ Fixture to clean up test temporary files and Docker containers. - + Cleans up both before (to ensure clean state) and after (to avoid conflicts). """ import subprocess - + # Clean up Docker containers BEFORE test (ensure clean state) try: subprocess.run( "docker ps -a | grep 'container_ci-dummy' | awk '{print $1}' | xargs -r docker rm -f", shell=True, capture_output=True, - timeout=30 + timeout=30, ) except Exception: pass # Ignore cleanup errors before test @@ -115,14 +116,14 @@ def clean_test_temp_files(request): shutil.rmtree(file_path) else: os.remove(file_path) - + # Clean up Docker containers AFTER test (avoid conflicts with next test) try: subprocess.run( "docker ps -a | grep 'container_ci-dummy' | awk '{print $1}' | xargs -r docker rm -f", shell=True, capture_output=True, - timeout=30 + timeout=30, ) except Exception: pass # Ignore cleanup errors after test @@ -184,13 +185,14 @@ def is_nvidia() -> bool: bool: True if NVIDIA GPU is present, False otherwise. 
""" global _gpu_vendor_cache - + if _gpu_vendor_cache is not None: return _gpu_vendor_cache == "NVIDIA" - + try: # Lazy import to avoid collection issues from madengine.core.context import Context + context = Context() _gpu_vendor_cache = context.ctx["gpu_vendor"] return _gpu_vendor_cache == "NVIDIA" @@ -216,6 +218,7 @@ def get_gpu_arch() -> str: try: from madengine.core.console import Console + console = Console(live_output=True) if is_nvidia(): arch = console.sh( @@ -224,9 +227,10 @@ def get_gpu_arch() -> str: arch = arch.strip() if arch else "" # Normalize "NVIDIA A100-SXM4-40GB" -> "A100", "NVIDIA H100 PCIe" -> "H100" if arch.startswith("NVIDIA "): - arch = arch[len("NVIDIA "):].split("-")[0].split()[0] + arch = arch[len("NVIDIA ") :].split("-")[0].split()[0] else: from madengine.core.constants import get_rocm_path + rocm_path = get_rocm_path() arch = console.sh(f"{rocm_path}/bin/rocminfo |grep -o -m 1 'gfx.*'") arch = arch.strip() if arch else "" @@ -244,17 +248,18 @@ def get_gpu_nodeid_map() -> dict: dict: GPU node id map mapping node_id strings to GPU indices. 
""" global _gpu_nodeid_map_cache - + if _gpu_nodeid_map_cache is not None: return _gpu_nodeid_map_cache - + try: # Lazy import to avoid collection issues from madengine.core.console import Console + gpu_map = {} console = Console(live_output=True) nvidia = is_nvidia() - + if nvidia: command = "nvidia-smi --list-gpus" output = console.sh(command) @@ -280,14 +285,18 @@ def get_gpu_nodeid_map() -> dict: # Parse version as tuple for proper comparison (6.4.1 vs 6.4.0) version_parts = rocm_version_str.split(".") if len(version_parts) >= 3: - rocm_version = tuple(int(p.split('-')[0]) for p in version_parts[:3]) + rocm_version = tuple( + int(p.split("-")[0]) for p in version_parts[:3] + ) else: # Fallback to float comparison for versions without patch rocm_version = (int(version_parts[0]), int(version_parts[1]), 0) - + # Use appropriate rocm-smi command based on version (PR #54: threshold is 6.4.1) command = ( - "rocm-smi --showuniqueid" if rocm_version < (6, 4, 1) else "rocm-smi --showhw" + "rocm-smi --showuniqueid" + if rocm_version < (6, 4, 1) + else "rocm-smi --showhw" ) output = console.sh(command) lines = output.split("\n") @@ -295,7 +304,9 @@ def get_gpu_nodeid_map() -> dict: for line in lines: if rocm_version < (6, 4, 1): if "Unique ID:" in line: - gpu_id = int(line.split(":")[0].split("[")[1].split("]")[0]) + gpu_id = int( + line.split(":")[0].split("[")[1].split("]")[0] + ) unique_id = line.split(":")[2].strip() gpu_map[unique_id] = gpu_id else: @@ -306,13 +317,22 @@ def get_gpu_nodeid_map() -> dict: except Exception: # If all else fails, return empty map pass - + _gpu_nodeid_map_cache = gpu_map return gpu_map - + except Exception: # If detection fails during collection, return a default mapping - _gpu_nodeid_map_cache = {'2': 0, '3': 1, '4': 2, '5': 3, '6': 4, '7': 5, '8': 6, '9': 7} + _gpu_nodeid_map_cache = { + "2": 0, + "3": 1, + "4": 2, + "5": 3, + "6": 4, + "7": 5, + "8": 6, + "9": 7, + } return _gpu_nodeid_map_cache @@ -323,10 +343,10 @@ def get_num_gpus() 
-> int: int: Number of GPUs present. """ global _num_gpus_cache - + if _num_gpus_cache is not None: return _num_gpus_cache - + try: gpu_map = get_gpu_nodeid_map() _num_gpus_cache = len(gpu_map) @@ -351,8 +371,11 @@ def get_num_cpus() -> int: try: # Lazy import to avoid collection issues from madengine.core.console import Console + console = Console(live_output=True) - _num_cpus_cache = int(console.sh("lscpu | grep \"^CPU(s):\" | awk '{print $2}'")) + _num_cpus_cache = int( + console.sh("lscpu | grep \"^CPU(s):\" | awk '{print $2}'") + ) return _num_cpus_cache except Exception: # Default to 64 CPUs if detection fails during collection @@ -423,7 +446,9 @@ def assert_model_in_perf_csv(csv_path, model, status="SUCCESS", performance=None pytest.fail( f"model {model} in perf CSV did not run successfully (status={row.get('status')})." ) - if performance is not None and str(row.get("performance", "")) != str(performance): + if performance is not None and str(row.get("performance", "")) != str( + performance + ): pytest.fail( f"model {model} expected performance {performance}, got {row.get('performance')}." ) diff --git a/tests/integration/test_container_execution.py b/tests/integration/test_container_execution.py index b0d11b3b..f70ba574 100644 --- a/tests/integration/test_container_execution.py +++ b/tests/integration/test_container_execution.py @@ -8,19 +8,21 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" +import json + # built-in modules import os -import json import unittest.mock -from unittest.mock import patch, MagicMock, mock_open +from unittest.mock import MagicMock, mock_open, patch # third-party modules import pytest +from madengine.core.console import Console +from madengine.core.context import Context + # project modules from madengine.execution.container_runner import ContainerRunner -from madengine.core.context import Context -from madengine.core.console import Console class TestContainerRunner: @@ -188,14 +190,17 @@ def test_run_container_success( mock_sh.return_value = "hostname" # Mock log file with performance metrics - log_content = "Running test...\nperformance: 100.5 samples_per_second\nTest completed" + log_content = ( + "Running test...\nperformance: 100.5 samples_per_second\nTest completed" + ) mock_file.return_value.read.return_value = log_content - + # Mock os.path.exists to return True for log file def exists_side_effect(path): if path.endswith(".live.log"): return True return False + mock_exists.side_effect = exists_side_effect model_info = { diff --git a/tests/integration/test_docker_integration.py b/tests/integration/test_docker_integration.py index 66455536..92d7396e 100644 --- a/tests/integration/test_docker_integration.py +++ b/tests/integration/test_docker_integration.py @@ -5,20 +5,22 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" +import json + # built-in modules import os -import json import shlex import tempfile -from unittest.mock import patch, MagicMock, mock_open +from unittest.mock import MagicMock, mock_open, patch # third-party modules import pytest +from madengine.core.console import Console +from madengine.core.context import Context + # project modules from madengine.execution.docker_builder import DockerBuilder -from madengine.core.context import Context -from madengine.core.console import Console class TestDockerBuilder: @@ -448,7 +450,10 @@ def test_export_build_manifest( # Set up some built images (key should match real DockerBuilder output) builder.built_images = { - "ci-model1": {"docker_image": "ci-model1", "dockerfile": "./docker/Dockerfile"} + "ci-model1": { + "docker_image": "ci-model1", + "dockerfile": "./docker/Dockerfile", + } } with patch("builtins.open", mock_open()) as mock_file: @@ -777,8 +782,8 @@ def test_build_manifest_with_tagged_image( mock_vendor, ): """Test that build manifest includes registry_image when pushing to registry.""" - import tempfile import os + import tempfile # Mock successful operations BEFORE creating Context # to avoid MagicMock objects being stored during initialization diff --git a/tests/integration/test_gpu_management.py b/tests/integration/test_gpu_management.py index 522fa038..95906050 100644 --- a/tests/integration/test_gpu_management.py +++ b/tests/integration/test_gpu_management.py @@ -28,13 +28,12 @@ from madengine.core.console import Console - - def is_amd_gpu(): """Check if system has AMD GPU.""" try: import subprocess - result = subprocess.run(['rocm-smi'], capture_output=True, timeout=5) + + result = subprocess.run(["rocm-smi"], capture_output=True, timeout=5) return result.returncode == 0 except Exception: return False @@ -44,226 +43,238 @@ def is_amd_gpu(): # GPU Tool Manager Tests # ============================================================================ + class TestBaseGPUToolManager: """Test the base GPU tool 
manager abstract class.""" - - class TestROCmToolManager: """Test the ROCm tool manager with 6.4.1 threshold (PR #54).""" - + def test_get_rocm_version_from_hipconfig(self): """Test ROCm version detection from hipconfig.""" manager = ROCmToolManager() - - with patch.object(manager, 'is_tool_available', return_value=True), \ - patch.object(manager, '_execute_shell_command') as mock_exec: + + with patch.object( + manager, "is_tool_available", return_value=True + ), patch.object(manager, "_execute_shell_command") as mock_exec: mock_exec.return_value = (True, "6.4.1-12345", "") - + version = manager.get_rocm_version() - + assert version == (6, 4, 1) # Verify result is cached assert manager._get_cached_result("rocm_version") == (6, 4, 1) - + def test_get_preferred_smi_tool_6_4_1_and_above(self): """Test that amd-smi is preferred for ROCm >= 6.4.1.""" manager = ROCmToolManager() - - with patch.object(manager, 'get_rocm_version', return_value=(6, 4, 1)): + + with patch.object(manager, "get_rocm_version", return_value=(6, 4, 1)): assert manager.get_preferred_smi_tool() == "amd-smi" - - with patch.object(manager, 'get_rocm_version', return_value=(6, 5, 0)): + + with patch.object(manager, "get_rocm_version", return_value=(6, 5, 0)): assert manager.get_preferred_smi_tool() == "amd-smi" - + def test_get_preferred_smi_tool_below_6_4_1(self): """Test that rocm-smi is preferred for ROCm < 6.4.1.""" manager = ROCmToolManager() - - with patch.object(manager, 'get_rocm_version', return_value=(6, 4, 0)): + + with patch.object(manager, "get_rocm_version", return_value=(6, 4, 0)): assert manager.get_preferred_smi_tool() == "rocm-smi" - - with patch.object(manager, 'get_rocm_version', return_value=(6, 3, 0)): + + with patch.object(manager, "get_rocm_version", return_value=(6, 3, 0)): assert manager.get_preferred_smi_tool() == "rocm-smi" - - with patch.object(manager, 'get_rocm_version', return_value=(5, 7, 0)): + + with patch.object(manager, "get_rocm_version", return_value=(5, 7, 0)): 
assert manager.get_preferred_smi_tool() == "rocm-smi" - + def test_get_gpu_count_with_amd_smi(self): """Test GPU count detection using amd-smi.""" manager = ROCmToolManager() - - with patch.object(manager, 'get_preferred_smi_tool', return_value="amd-smi"), \ - patch.object(manager, 'execute_command', return_value="8"): + + with patch.object( + manager, "get_preferred_smi_tool", return_value="amd-smi" + ), patch.object(manager, "execute_command", return_value="8"): count = manager.get_gpu_count() - + assert count == 8 # Verify caching assert manager._get_cached_result("gpu_count") == 8 - + def test_get_gpu_count_with_fallback_to_rocm_smi(self): """Test GPU count fallback from amd-smi to rocm-smi.""" manager = ROCmToolManager() - + def mock_execute(command, fallback=None, timeout=30): # Simulate amd-smi failure, rocm-smi success if "amd-smi" in command: raise RuntimeError("amd-smi not found") return "4" - - with patch.object(manager, 'get_preferred_smi_tool', return_value="amd-smi"), \ - patch.object(manager, 'execute_command', side_effect=mock_execute): + + with patch.object( + manager, "get_preferred_smi_tool", return_value="amd-smi" + ), patch.object(manager, "execute_command", side_effect=mock_execute): # Should fallback successfully - with pytest.raises(RuntimeError): # Our mock raises, but real impl would fallback + with pytest.raises( + RuntimeError + ): # Our mock raises, but real impl would fallback manager.get_gpu_count() - + def test_get_gpu_product_name_with_fallback(self): """Test GPU product name with rocm-smi fallback (PR #54).""" manager = ROCmToolManager() - - with patch.object(manager, 'get_preferred_smi_tool', return_value="amd-smi"), \ - patch.object(manager, 'execute_command', return_value="AMD Instinct MI300X"): + + with patch.object( + manager, "get_preferred_smi_tool", return_value="amd-smi" + ), patch.object(manager, "execute_command", return_value="AMD Instinct MI300X"): product = manager.get_gpu_product_name(gpu_id=0) - + assert product == 
"AMD Instinct MI300X" - assert manager._get_cached_result("gpu_product_name:0") == "AMD Instinct MI300X" - + assert ( + manager._get_cached_result("gpu_product_name:0") + == "AMD Instinct MI300X" + ) + def test_get_gpu_architecture(self): """Test GPU architecture detection via rocminfo.""" manager = ROCmToolManager() - - with patch.object(manager, '_execute_shell_command') as mock_exec: + + with patch.object(manager, "_execute_shell_command") as mock_exec: mock_exec.return_value = (True, "gfx942", "") - + arch = manager.get_gpu_architecture() - + assert arch == "gfx942" assert manager._get_cached_result("gpu_architecture") == "gfx942" - + def test_execute_command_with_fallback(self): """Test command execution with fallback mechanism.""" manager = ROCmToolManager() - - with patch.object(manager, '_execute_shell_command') as mock_exec: + + with patch.object(manager, "_execute_shell_command") as mock_exec: # First call fails, second succeeds mock_exec.side_effect = [ (False, "", "command not found"), - (True, "success", "") + (True, "success", ""), ] - + result = manager.execute_command("primary_cmd", "fallback_cmd") - + assert result == "success" assert mock_exec.call_count == 2 - - class TestNvidiaToolManager: """Test the NVIDIA tool manager.""" - + def test_initialization(self): """Test NVIDIA tool manager initialization.""" manager = NvidiaToolManager() assert manager is not None - + def test_get_cuda_version_from_nvcc(self): """Test CUDA version detection from nvcc.""" manager = NvidiaToolManager() - - with patch.object(manager, 'is_tool_available', return_value=True), \ - patch.object(manager, '_execute_shell_command') as mock_exec: + + with patch.object( + manager, "is_tool_available", return_value=True + ), patch.object(manager, "_execute_shell_command") as mock_exec: mock_exec.return_value = (True, "12.0", "") - + version = manager.get_cuda_version() - + assert version == "12.0" assert manager._get_cached_result("cuda_version") == "12.0" - + def 
test_get_driver_version(self): """Test NVIDIA driver version detection.""" manager = NvidiaToolManager() - - with patch.object(manager, 'is_tool_available', return_value=True), \ - patch.object(manager, '_execute_shell_command') as mock_exec: + + with patch.object( + manager, "is_tool_available", return_value=True + ), patch.object(manager, "_execute_shell_command") as mock_exec: mock_exec.return_value = (True, "525.60.13", "") - + version = manager.get_driver_version() - + assert version == "525.60.13" - + def test_execute_nvidia_smi(self): """Test nvidia-smi execution.""" manager = NvidiaToolManager() - - with patch.object(manager, 'is_tool_available', return_value=True), \ - patch.object(manager, 'execute_command', return_value="GPU info"): + + with patch.object( + manager, "is_tool_available", return_value=True + ), patch.object(manager, "execute_command", return_value="GPU info"): result = manager.execute_nvidia_smi("--list-gpus") - + assert result == "GPU info" - + def test_get_gpu_count(self): """Test NVIDIA GPU count detection.""" manager = NvidiaToolManager() - - with patch.object(manager, 'execute_nvidia_smi', return_value="8"): + + with patch.object(manager, "execute_nvidia_smi", return_value="8"): count = manager.get_gpu_count() - + assert count == 8 - + def test_get_gpu_product_name(self): """Test NVIDIA GPU product name detection.""" manager = NvidiaToolManager() - - with patch.object(manager, 'execute_nvidia_smi', return_value="NVIDIA H100 80GB HBM3"): - product = manager.get_gpu_product_name(gpu_id=0) - - assert product == "NVIDIA H100 80GB HBM3" + with patch.object( + manager, "execute_nvidia_smi", return_value="NVIDIA H100 80GB HBM3" + ): + product = manager.get_gpu_product_name(gpu_id=0) + assert product == "NVIDIA H100 80GB HBM3" class TestGPUToolFactory: """Test the GPU tool factory with singleton pattern.""" - + def setup_method(self): """Clear factory cache before each test.""" clear_manager_cache() - + def teardown_method(self): """Clear 
factory cache after each test.""" clear_manager_cache() - + def test_get_amd_manager(self): """Test getting AMD tool manager.""" - with patch('madengine.utils.gpu_validator.detect_gpu_vendor', return_value=GPUVendor.AMD): + with patch( + "madengine.utils.gpu_validator.detect_gpu_vendor", + return_value=GPUVendor.AMD, + ): manager = get_gpu_tool_manager(GPUVendor.AMD) - + assert isinstance(manager, ROCmToolManager) - + def test_get_nvidia_manager(self): """Test getting NVIDIA tool manager.""" manager = get_gpu_tool_manager(GPUVendor.NVIDIA) - + assert isinstance(manager, NvidiaToolManager) - + def test_singleton_pattern(self): """Test that factory returns same instance (singleton).""" manager1 = get_gpu_tool_manager(GPUVendor.AMD) manager2 = get_gpu_tool_manager(GPUVendor.AMD) - + assert manager1 is manager2 # Same instance - + def test_different_vendors_different_instances(self): """Test that different vendors get different instances.""" amd_manager = get_gpu_tool_manager(GPUVendor.AMD) nvidia_manager = get_gpu_tool_manager(GPUVendor.NVIDIA) - + assert amd_manager is not nvidia_manager assert isinstance(amd_manager, ROCmToolManager) assert isinstance(nvidia_manager, NvidiaToolManager) - + def test_auto_detect_vendor(self): """Test auto-detection of GPU vendor.""" with patch( @@ -273,23 +284,23 @@ def test_auto_detect_vendor(self): manager = get_gpu_tool_manager(vendor=None) assert isinstance(manager, ROCmToolManager) - + def test_unknown_vendor_raises_error(self): """Test that unknown vendor raises appropriate error.""" with pytest.raises(ValueError, match="Unable to detect GPU vendor"): get_gpu_tool_manager(GPUVendor.UNKNOWN) - + def test_clear_manager_cache(self): """Test clearing manager cache.""" manager1 = get_gpu_tool_manager(GPUVendor.AMD) - + clear_manager_cache() - + manager2 = get_gpu_tool_manager(GPUVendor.AMD) - + # After clearing cache, should get new instance assert manager1 is not manager2 - + def test_get_cached_managers(self): """Test getting 
dictionary of cached managers.""" amd_manager = get_gpu_tool_manager(GPUVendor.AMD) @@ -307,125 +318,124 @@ def test_get_cached_managers(self): assert cached[nvidia_keys[0]] is nvidia_manager - - class TestToolManagerIntegration: """Integration tests for tool managers with Context.""" - + def test_context_uses_tool_manager_for_gpu_count(self): """Test that Context uses tool manager for GPU count.""" from madengine.core.context import Context - - additional_context = json.dumps({ - "gpu_vendor": "AMD", - "guest_os": "UBUNTU" - }) - - with patch('madengine.core.context.Context.get_gpu_vendor', return_value="AMD"), \ - patch('madengine.core.context.Context._get_tool_manager') as mock_get_manager: - + + additional_context = json.dumps({"gpu_vendor": "AMD", "guest_os": "UBUNTU"}) + + with patch( + "madengine.core.context.Context.get_gpu_vendor", return_value="AMD" + ), patch( + "madengine.core.context.Context._get_tool_manager" + ) as mock_get_manager: + mock_manager = Mock() mock_manager.get_gpu_count.return_value = 8 mock_get_manager.return_value = mock_manager - + context = Context( - additional_context=additional_context, - build_only_mode=True + additional_context=additional_context, build_only_mode=True ) - + # Force initialization of docker_env_vars context.ctx["docker_env_vars"] = {"MAD_GPU_VENDOR": "AMD"} - + count = context.get_system_ngpus() - + assert count == 8 mock_manager.get_gpu_count.assert_called_once() - + def test_context_uses_tool_manager_for_product_name(self): """Test that Context uses tool manager for GPU product name (PR #54).""" from madengine.core.context import Context - - additional_context = json.dumps({ - "gpu_vendor": "AMD", - "guest_os": "UBUNTU" - }) - - with patch('madengine.core.context.Context._get_tool_manager') as mock_get_manager: + + additional_context = json.dumps({"gpu_vendor": "AMD", "guest_os": "UBUNTU"}) + + with patch( + "madengine.core.context.Context._get_tool_manager" + ) as mock_get_manager: mock_manager = Mock() 
mock_manager.get_gpu_product_name.return_value = "AMD Instinct MI300X" mock_get_manager.return_value = mock_manager - + context = Context( - additional_context=additional_context, - build_only_mode=True + additional_context=additional_context, build_only_mode=True ) - + context.ctx["docker_env_vars"] = {"MAD_GPU_VENDOR": "AMD"} - + product = context.get_system_gpu_product_name() - + assert product == "AMD Instinct MI300X" mock_manager.get_gpu_product_name.assert_called_once_with(gpu_id=0) - - class TestPR54Compliance: """Test compliance with PR #54 requirements.""" - + def test_rocm_version_threshold_is_6_4_1(self): """Test that ROCm version threshold matches PR #54.""" - assert ROCM_VERSION_THRESHOLD == (6, 4, 1), \ - "ROCm version threshold must be 6.4.1 as per PR #54" - + assert ROCM_VERSION_THRESHOLD == ( + 6, + 4, + 1, + ), "ROCm version threshold must be 6.4.1 as per PR #54" + def test_amd_smi_preferred_for_6_4_1_and_above(self): """Test amd-smi is preferred for ROCm >= 6.4.1 (PR #54).""" manager = ROCmToolManager() - + test_versions = [ ((6, 4, 1), "amd-smi"), ((6, 4, 2), "amd-smi"), ((6, 5, 0), "amd-smi"), ((7, 0, 0), "amd-smi"), ] - + for version, expected_tool in test_versions: - with patch.object(manager, 'get_rocm_version', return_value=version): + with patch.object(manager, "get_rocm_version", return_value=version): tool = manager.get_preferred_smi_tool() - assert tool == expected_tool, \ - f"ROCm {version} should prefer {expected_tool}" - + assert ( + tool == expected_tool + ), f"ROCm {version} should prefer {expected_tool}" + def test_rocm_smi_used_for_below_6_4_1(self): """Test rocm-smi is used for ROCm < 6.4.1 (PR #54).""" manager = ROCmToolManager() - + test_versions = [ ((6, 4, 0), "rocm-smi"), ((6, 3, 0), "rocm-smi"), ((6, 0, 0), "rocm-smi"), ((5, 7, 0), "rocm-smi"), ] - + for version, expected_tool in test_versions: - with patch.object(manager, 'get_rocm_version', return_value=version): + with patch.object(manager, "get_rocm_version", 
return_value=version): tool = manager.get_preferred_smi_tool() - assert tool == expected_tool, \ - f"ROCm {version} should use {expected_tool}" - + assert ( + tool == expected_tool + ), f"ROCm {version} should use {expected_tool}" + def test_gpu_product_name_has_fallback(self): """Test GPU product name has rocm-smi fallback (PR #54).""" manager = ROCmToolManager() - + # Verify the method supports fallback by checking it calls execute_command - with patch.object(manager, 'get_preferred_smi_tool', return_value="amd-smi"), \ - patch.object(manager, 'execute_command') as mock_exec: + with patch.object( + manager, "get_preferred_smi_tool", return_value="amd-smi" + ), patch.object(manager, "execute_command") as mock_exec: mock_exec.return_value = "AMD Instinct MI300X" - + product = manager.get_gpu_product_name(0) - + # Verify execute_command was called (which has fallback logic) mock_exec.assert_called_once() - + # Verify both amd-smi and rocm-smi commands are in the call call_args = mock_exec.call_args assert "amd-smi" in str(call_args) or "rocm-smi" in str(call_args) @@ -435,12 +445,11 @@ def test_gpu_product_name_has_fallback(self): pytest.main([__file__, "-v"]) - - # ============================================================================ # GPU RenderD Nodes Tests # ============================================================================ + class TestGetGpuRenderDNodesIntegration: """Integration test suite for the get_gpu_renderD_nodes method using real hardware.""" @@ -449,13 +458,19 @@ def test_returns_none_for_non_amd_gpu(self): """Test that the function returns None for non-AMD GPUs.""" from unittest.mock import patch - with patch.object(Context, "get_gpu_vendor", return_value="NVIDIA"), \ - patch.object(Context, "get_system_ngpus", return_value=0), \ - patch.object(Context, "get_system_gpu_architecture", return_value=""), \ - patch.object(Context, "get_system_gpu_product_name", return_value=""), \ - patch.object(Context, "get_system_hip_version", 
return_value="5.0"), \ - patch.object(Context, "get_docker_gpus", return_value="0"), \ - patch.object(Context, "get_gpu_renderD_nodes", return_value=None): + with patch.object( + Context, "get_gpu_vendor", return_value="NVIDIA" + ), patch.object(Context, "get_system_ngpus", return_value=0), patch.object( + Context, "get_system_gpu_architecture", return_value="" + ), patch.object( + Context, "get_system_gpu_product_name", return_value="" + ), patch.object( + Context, "get_system_hip_version", return_value="5.0" + ), patch.object( + Context, "get_docker_gpus", return_value="0" + ), patch.object( + Context, "get_gpu_renderD_nodes", return_value=None + ): context = Context() # Should return None for non-AMD GPUs @@ -466,139 +481,149 @@ def test_returns_none_for_non_amd_gpu(self): def test_returns_list_for_amd_gpu(self): """Test that the function returns a list of renderD nodes for AMD GPUs.""" context = Context() - + # Should return a list for AMD GPUs - assert context.ctx['gpu_renderDs'] is not None - assert isinstance(context.ctx['gpu_renderDs'], list) - + assert context.ctx["gpu_renderDs"] is not None + assert isinstance(context.ctx["gpu_renderDs"], list) + # List should not be empty if there are GPUs - if context.ctx['docker_env_vars']['MAD_SYSTEM_NGPUS'] > 0: - assert len(context.ctx['gpu_renderDs']) > 0 + if context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] > 0: + assert len(context.ctx["gpu_renderDs"]) > 0 @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_renderD_count_matches_gpu_count(self): """Test that the number of renderD nodes matches the number of GPUs.""" context = Context() - + # Get GPU count from context (which uses amd-smi list --csv or rocm-smi as fallback) # This is more reliable than amd-smi list -e --json which only works on ROCm 6.4+ - expected_gpu_count = context.ctx['docker_env_vars']['MAD_SYSTEM_NGPUS'] - + expected_gpu_count = context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] + # Skip test if no GPUs detected 
if expected_gpu_count == 0: pytest.skip("No GPUs detected on system") - + # The number of renderD nodes should match the number of GPUs - assert len(context.ctx['gpu_renderDs']) == expected_gpu_count - + assert len(context.ctx["gpu_renderDs"]) == expected_gpu_count + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_renderD_values_are_valid(self): """Test that all renderD values are valid integers.""" context = Context() - + # All renderD values should be positive integers - for renderD in context.ctx['gpu_renderDs']: + for renderD in context.ctx["gpu_renderDs"]: assert isinstance(renderD, int) assert renderD > 0 - + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_renderD_nodes_are_unique(self): """Test that all renderD nodes are unique.""" context = Context() - - renderDs = context.ctx['gpu_renderDs'] + + renderDs = context.ctx["gpu_renderDs"] # All renderD values should be unique assert len(renderDs) == len(set(renderDs)) - + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_renderD_values_match_kfd_properties(self): """Test that renderD values match what's in KFD properties.""" console = Console() context = Context() - + # Get renderD values from KFD directly try: - kfd_output = console.sh("grep -r drm_render_minor /sys/devices/virtual/kfd/kfd/topology/nodes") + kfd_output = console.sh( + "grep -r drm_render_minor /sys/devices/virtual/kfd/kfd/topology/nodes" + ) kfd_lines = [line for line in kfd_output.split("\n") if line.strip()] # Filter out CPU entries (renderD value 0) - kfd_renderDs = [int(line.split()[-1]) for line in kfd_lines if int(line.split()[-1]) != 0] + kfd_renderDs = [ + int(line.split()[-1]) + for line in kfd_lines + if int(line.split()[-1]) != 0 + ] except Exception: pytest.skip("Unable to read KFD properties") - + # The renderD values from context should be a subset of KFD renderDs - for renderD in context.ctx['gpu_renderDs']: - assert renderD in 
kfd_renderDs, f"renderD {renderD} not found in KFD properties" - + for renderD in context.ctx["gpu_renderDs"]: + assert ( + renderD in kfd_renderDs + ), f"renderD {renderD} not found in KFD properties" + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_gpu_ordering_is_consistent(self): """Test that GPU ordering matches amd-smi GPU IDs.""" console = Console() context = Context() - + try: # Get amd-smi data amd_smi_output = console.sh("amd-smi list -e --json") gpu_data = json.loads(amd_smi_output) - + # Sort by GPU ID sorted_gpus = sorted(gpu_data, key=lambda x: x["gpu"]) - + # The number of GPUs should match - assert len(context.ctx['gpu_renderDs']) == len(sorted_gpus) - + assert len(context.ctx["gpu_renderDs"]) == len(sorted_gpus) + except Exception: pytest.skip("Unable to verify GPU ordering with amd-smi") - + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_renderD_nodes_exist_in_dev(self): """Test that the renderD nodes actually exist in /dev/dri/.""" context = Context() - + # Check that each renderD node exists as a device file - for renderD in context.ctx['gpu_renderDs']: + for renderD in context.ctx["gpu_renderDs"]: dev_path = f"/dev/dri/renderD{renderD}" assert os.path.exists(dev_path), f"Device {dev_path} does not exist" # Should be a character device - assert stat.S_ISCHR(os.stat(dev_path).st_mode), f"{dev_path} is not a character device" - + assert stat.S_ISCHR( + os.stat(dev_path).st_mode + ), f"{dev_path} is not a character device" + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_no_cpu_entries_in_renderDs(self): """Test that CPU entries (renderD=0) are not included.""" context = Context() - + # None of the renderD values should be 0 (CPUs) - for renderD in context.ctx['gpu_renderDs']: + for renderD in context.ctx["gpu_renderDs"]: assert renderD != 0, "CPU entry (renderD=0) found in GPU renderD list" - + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires 
AMD GPU") def test_context_initialization_succeeds(self): """Test that Context initialization succeeds with real GPU data.""" # This should not raise any exceptions context = Context() - + # Basic sanity checks assert context.ctx is not None - assert 'gpu_renderDs' in context.ctx - assert 'docker_env_vars' in context.ctx - assert 'MAD_GPU_VENDOR' in context.ctx['docker_env_vars'] - + assert "gpu_renderDs" in context.ctx + assert "docker_env_vars" in context.ctx + assert "MAD_GPU_VENDOR" in context.ctx["docker_env_vars"] + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_renderD_mapping_is_reproducible(self): """Test that creating multiple Context objects produces the same renderD mapping.""" context1 = Context() context2 = Context() - + # The renderD lists should be identical - assert context1.ctx['gpu_renderDs'] == context2.ctx['gpu_renderDs'] - + assert context1.ctx["gpu_renderDs"] == context2.ctx["gpu_renderDs"] + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_renderD_values_are_in_valid_range(self): """Test that renderD values are in the valid Linux device range.""" context = Context() - + # renderD values typically start at 128 and go up # Valid range is 128-255 for render nodes - for renderD in context.ctx['gpu_renderDs']: - assert 128 <= renderD <= 255, f"renderD {renderD} is outside valid range [128, 255]" - - + for renderD in context.ctx["gpu_renderDs"]: + assert ( + 128 <= renderD <= 255 + ), f"renderD {renderD} is outside valid range [128, 255]" diff --git a/tests/integration/test_orchestrator_workflows.py b/tests/integration/test_orchestrator_workflows.py index 143e56e5..09264d80 100644 --- a/tests/integration/test_orchestrator_workflows.py +++ b/tests/integration/test_orchestrator_workflows.py @@ -18,15 +18,15 @@ # project modules from madengine.cli import app +from madengine.core.errors import BuildError, ConfigurationError, DiscoveryError from madengine.orchestration.build_orchestrator 
import BuildOrchestrator from madengine.orchestration.run_orchestrator import RunOrchestrator -from madengine.core.errors import BuildError, ConfigurationError, DiscoveryError - # ============================================================================ # Batch manifest (CLI build options) # ============================================================================ + class TestBatchManifestBuildIntegration: """Batch manifest and --tags are mutually exclusive.""" @@ -42,9 +42,12 @@ def test_batch_manifest_mutually_exclusive_with_tags(self): app, [ "build", - "--batch-manifest", batch_file, - "--tags", "dummy", - "--additional-context", '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}', + "--batch-manifest", + batch_file, + "--tags", + "dummy", + "--additional-context", + '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}', ], ) assert result.exit_code != 0 @@ -57,6 +60,7 @@ def test_batch_manifest_mutually_exclusive_with_tags(self): # Build orchestrator # ============================================================================ + class TestBuildOrchestrator: """Test the Build Orchestrator module.""" @@ -163,7 +167,9 @@ def test_build_execute_success( # Mock context mock_context = MagicMock() - mock_context.ctx = {"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a"}} + mock_context.ctx = { + "docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a"} + } mock_context_class.return_value = mock_context # Mock discover models @@ -356,9 +362,7 @@ def test_run_execute_no_manifest_no_tags(self, mock_exists): orchestrator.execute(manifest_file=None, tags=None) @patch("madengine.orchestration.build_orchestrator.BuildOrchestrator") - def test_run_execute_triggers_build_phase( - self, mock_build_orchestrator - ): + def test_run_execute_triggers_build_phase(self, mock_build_orchestrator): """Test run execution triggers build phase when no manifest exists.""" mock_args = MagicMock() mock_args.additional_context = None @@ -378,9 +382,13 @@ def 
test_run_execute_triggers_build_phase( orchestrator = RunOrchestrator(mock_args) # Mock file operations and execution - with patch("os.path.exists", side_effect=lambda p: p == "build_manifest.json"), \ - patch("builtins.open", mock_open(read_data=json.dumps(manifest_data))), \ - patch.object(orchestrator, "_execute_local", return_value={}) as mock_execute_local: + with patch( + "os.path.exists", side_effect=lambda p: p == "build_manifest.json" + ), patch( + "builtins.open", mock_open(read_data=json.dumps(manifest_data)) + ), patch.object( + orchestrator, "_execute_local", return_value={} + ) as mock_execute_local: orchestrator.execute(manifest_file=None, tags=["test"]) mock_build_instance.execute.assert_called_once() @@ -442,9 +450,7 @@ def test_run_execute_distributed(self, mock_exists, mock_file): read_data='{"built_images": {"model1": {"name": "model1"}}, "context": {}}', ) @patch("os.path.exists", return_value=True) - def test_execute_local_with_mock( - self, mock_exists, mock_file - ): + def test_execute_local_with_mock(self, mock_exists, mock_file): """Test local execution workflow (mocked).""" mock_args = MagicMock() mock_args.additional_context = '{"deploy": "local"}' @@ -470,9 +476,20 @@ def test_filter_images_by_gpu_architecture(self): orchestrator = RunOrchestrator(mock_args) built_images = { - "model1": {"name": "model1", "gpu_architecture": "gfx90a", "gpu_vendor": "AMD"}, - "model2": {"name": "model2", "gpu_architecture": "gfx908", "gpu_vendor": "AMD"}, - "model3": {"name": "model3", "gpu_architecture": ""}, # Legacy - no gpu_vendor + "model1": { + "name": "model1", + "gpu_architecture": "gfx90a", + "gpu_vendor": "AMD", + }, + "model2": { + "name": "model2", + "gpu_architecture": "gfx908", + "gpu_vendor": "AMD", + }, + "model3": { + "name": "model3", + "gpu_architecture": "", + }, # Legacy - no gpu_vendor } # Filter for gfx90a @@ -483,4 +500,3 @@ def test_filter_images_by_gpu_architecture(self): assert "model1" in compatible assert "model2" not in 
compatible assert "model3" in compatible # Legacy images without gpu_vendor pass through - diff --git a/tests/integration/test_platform_integration.py b/tests/integration/test_platform_integration.py index 82519a89..c13b031e 100644 --- a/tests/integration/test_platform_integration.py +++ b/tests/integration/test_platform_integration.py @@ -10,25 +10,26 @@ import json import os from pathlib import Path -from unittest.mock import MagicMock, patch, mock_open +from unittest.mock import MagicMock, mock_open, patch + import pytest +from madengine.core.errors import BuildError from madengine.execution.docker_builder import DockerBuilder from madengine.execution.dockerfile_utils import ( - parse_dockerfile_gpu_variables, - normalize_architecture_name, - is_target_arch_compatible_with_variable, is_compilation_arch_compatible, + is_target_arch_compatible_with_variable, + normalize_architecture_name, + parse_dockerfile_gpu_variables, ) from madengine.orchestration.build_orchestrator import BuildOrchestrator from madengine.orchestration.run_orchestrator import RunOrchestrator -from madengine.core.errors import BuildError - # ============================================================================ # Multi-Platform Build Tests # ============================================================================ + class TestMultiPlatformBuild: """Test build orchestration across different platforms.""" @@ -44,14 +45,16 @@ def test_build_initialization_all_platforms( ): with patch("os.path.exists", return_value=False): orchestrator = BuildOrchestrator(mock_build_args) - + assert orchestrator.args == mock_build_args assert orchestrator.context == multi_platform_context assert orchestrator.credentials is None @pytest.mark.unit @pytest.mark.amd - def test_build_amd_gpu_architecture_detection(self, amd_gpu_context, mock_build_args): + def test_build_amd_gpu_architecture_detection( + self, amd_gpu_context, mock_build_args + ): """Test AMD GPU architecture is correctly detected and used.""" 
with patch( "madengine.orchestration.build_orchestrator.Context", @@ -59,7 +62,7 @@ def test_build_amd_gpu_architecture_detection(self, amd_gpu_context, mock_build_ ): with patch("os.path.exists", return_value=False): orchestrator = BuildOrchestrator(mock_build_args) - + assert orchestrator.context.get_gpu_vendor() == "AMD" assert orchestrator.context.get_system_gpu_architecture() == "gfx90a" @@ -75,7 +78,7 @@ def test_build_nvidia_gpu_architecture_detection( ): with patch("os.path.exists", return_value=False): orchestrator = BuildOrchestrator(mock_build_args) - + assert orchestrator.context.get_gpu_vendor() == "NVIDIA" assert orchestrator.context.get_system_gpu_architecture() == "sm_90" @@ -89,7 +92,7 @@ def test_build_cpu_only_mode(self, cpu_context, mock_build_args): ): with patch("os.path.exists", return_value=False): orchestrator = BuildOrchestrator(mock_build_args) - + assert orchestrator.context.get_gpu_vendor() == "NONE" assert orchestrator.context.get_system_ngpus() == 0 @@ -98,6 +101,7 @@ def test_build_cpu_only_mode(self, cpu_context, mock_build_args): # Error Handling and Resilience Tests # ============================================================================ + class TestBuildResilience: """Test build resilience and error handling.""" @@ -141,7 +145,9 @@ def test_partial_build_failure_saves_manifest( mock_builder_instance.export_build_manifest.assert_called_once() # Verify successful builds are available - summary = mock_builder_instance.build_all_models.return_value + summary = ( + mock_builder_instance.build_all_models.return_value + ) assert len(summary["successful_builds"]) == 1 assert len(summary["failed_builds"]) == 1 @@ -244,6 +250,7 @@ def test_multi_model_build_continues_on_single_failure( # Multi-Architecture Build Tests # ============================================================================ + class TestMultiArchitectureBuild: """Test multi-architecture build scenarios.""" @@ -302,7 +309,9 @@ def test_multi_arch_amd_builds(self, 
mock_build_args, amd_gpu_context): manifest_file = orchestrator.execute() # Verify all architectures were built - summary = mock_builder_instance.build_all_models.return_value + summary = ( + mock_builder_instance.build_all_models.return_value + ) assert len(summary["successful_builds"]) == 3 archs = [ b["gpu_architecture"] @@ -317,6 +326,7 @@ def test_multi_arch_amd_builds(self, mock_build_args, amd_gpu_context): # Run Orchestrator Multi-Platform Tests # ============================================================================ + class TestMultiPlatformRun: """Test run orchestration across different platforms.""" @@ -397,6 +407,7 @@ def test_run_multi_model_continues_on_failure( # Integration Tests (Full Flow) # ============================================================================ + class TestEndToEndIntegration: """Integration tests for complete build + run workflows.""" @@ -465,7 +476,9 @@ def test_build_then_run_workflow( "total_runs": 1, }, ): - result = run_orchestrator.execute(manifest_file="build_manifest.json") + result = run_orchestrator.execute( + manifest_file="build_manifest.json" + ) assert len(result["successful_runs"]) == 1 assert len(result["failed_runs"]) == 0 @@ -475,6 +488,7 @@ def test_build_then_run_workflow( # Platform-Specific Behavior Tests # ============================================================================ + class TestPlatformSpecificBehavior: """Test platform-specific behaviors and edge cases.""" @@ -553,6 +567,7 @@ def test_cpu_only_execution(self, cpu_context, mock_run_args, temp_manifest_file # Multi-GPU architecture (Dockerfile parsing, normalization, image filtering) # ============================================================================ + class TestMultiGPUArch: """Multi-arch DockerBuilder logic, dockerfile_utils, and run-phase image filtering.""" @@ -573,27 +588,46 @@ def setup_method(self): @patch.object(DockerBuilder, "_get_dockerfiles_for_model") @patch.object(DockerBuilder, 
"_check_dockerfile_has_gpu_variables") @patch.object(DockerBuilder, "build_image") - def test_multi_arch_build_image_naming(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): + def test_multi_arch_build_image_naming( + self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles + ): model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") - mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} - result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + mock_build_image.return_value = { + "docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", + "build_duration": 1.0, + } + result = self.builder._build_model_for_arch( + model_info, "gfx908", None, False, None, "", None + ) assert result[0]["docker_image"].endswith("_gfx908") mock_check_gpu_vars.return_value = (False, "docker/dummy.Dockerfile") - mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} - result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + mock_build_image.return_value = { + "docker_image": "ci-dummy_dummy.ubuntu.amd", + "build_duration": 1.0, + } + result = self.builder._build_model_for_arch( + model_info, "gfx908", None, False, None, "", None + ) assert not result[0]["docker_image"].endswith("_gfx908") @patch.object(DockerBuilder, "_get_dockerfiles_for_model") @patch.object(DockerBuilder, "_check_dockerfile_has_gpu_variables") @patch.object(DockerBuilder, "build_image") - def test_multi_arch_manifest_fields(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): + def test_multi_arch_manifest_fields( + self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles + ): model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} 
mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") - mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} - result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + mock_build_image.return_value = { + "docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", + "build_duration": 1.0, + } + result = self.builder._build_model_for_arch( + model_info, "gfx908", None, False, None, "", None + ) assert result[0]["gpu_architecture"] == "gfx908" @patch.object(DockerBuilder, "_get_dockerfiles_for_model") @@ -601,17 +635,31 @@ def test_multi_arch_manifest_fields(self, mock_build_image, mock_check_gpu_vars, def test_legacy_single_arch_build(self, mock_build_image, mock_get_dockerfiles): model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] - mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} - result = self.builder._build_model_single_arch(model_info, None, False, None, "", None) + mock_build_image.return_value = { + "docker_image": "ci-dummy_dummy.ubuntu.amd", + "build_duration": 1.0, + } + result = self.builder._build_model_single_arch( + model_info, None, False, None, "", None + ) assert result[0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" @patch.object(DockerBuilder, "_build_model_single_arch") def test_additional_context_overrides_target_archs(self, mock_single_arch): - self.context.ctx = {"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx908"}} + self.context.ctx = { + "docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx908"} + } model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} - mock_single_arch.return_value = [{"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0}] - result = self.builder.build_all_models([model_info], 
target_archs=["gfx908", "gfx90a"]) - assert result["successful_builds"][0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" + mock_single_arch.return_value = [ + {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} + ] + result = self.builder.build_all_models( + [model_info], target_archs=["gfx908", "gfx90a"] + ) + assert ( + result["successful_builds"][0]["docker_image"] + == "ci-dummy_dummy.ubuntu.amd" + ) def test_parse_dockerfile_gpu_variables(self): content = """ @@ -640,20 +688,41 @@ def test_parse_malformed_dockerfile(self): def test_normalize_architecture_name(self): cases = { - "gfx908": "gfx908", "GFX908": "gfx908", "mi100": "gfx908", "mi-100": "gfx908", - "mi200": "gfx90a", "mi-200": "gfx90a", "mi210": "gfx90a", "mi250": "gfx90a", - "mi300": "gfx940", "mi-300": "gfx940", "mi300a": "gfx940", - "mi300x": "gfx942", "mi-300x": "gfx942", "unknown": "unknown", "": None, + "gfx908": "gfx908", + "GFX908": "gfx908", + "mi100": "gfx908", + "mi-100": "gfx908", + "mi200": "gfx90a", + "mi-200": "gfx90a", + "mi210": "gfx90a", + "mi250": "gfx90a", + "mi300": "gfx940", + "mi-300": "gfx940", + "mi300a": "gfx940", + "mi300x": "gfx942", + "mi-300x": "gfx942", + "unknown": "unknown", + "": None, } for inp, expected in cases.items(): assert normalize_architecture_name(inp) == expected def test_is_target_arch_compatible_with_variable(self): - assert is_target_arch_compatible_with_variable("MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx908"], "gfx942") - assert is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908", "gfx942"], "gfx942") - assert not is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908"], "gfx942") - assert is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx908") - assert not is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx942") + assert is_target_arch_compatible_with_variable( + "MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx908"], "gfx942" + ) + assert 
is_target_arch_compatible_with_variable( + "PYTORCH_ROCM_ARCH", ["gfx908", "gfx942"], "gfx942" + ) + assert not is_target_arch_compatible_with_variable( + "PYTORCH_ROCM_ARCH", ["gfx908"], "gfx942" + ) + assert is_target_arch_compatible_with_variable( + "GFX_COMPILATION_ARCH", ["gfx908"], "gfx908" + ) + assert not is_target_arch_compatible_with_variable( + "GFX_COMPILATION_ARCH", ["gfx908"], "gfx942" + ) assert is_target_arch_compatible_with_variable("UNKNOWN_VAR", ["foo"], "bar") def test_is_compilation_arch_compatible(self): @@ -664,4 +733,3 @@ def test_is_compilation_arch_compatible(self): if __name__ == "__main__": pytest.main([__file__, "-v", "--tb=short"]) - diff --git a/tests/integration/test_profiling_tools_config.py b/tests/integration/test_profiling_tools_config.py index 7dc36f94..4659b608 100644 --- a/tests/integration/test_profiling_tools_config.py +++ b/tests/integration/test_profiling_tools_config.py @@ -27,7 +27,9 @@ def test_rocm_trace_lite_config_and_apply_tools(): assert cfg_default["env_vars"].get("RTL_MODE") == "default" assert cfg_default["cmd"] == cfg["cmd"] - wrapper = get_madengine_root() / "scripts" / "common" / "tools" / "rtl_trace_wrapper.sh" + wrapper = ( + get_madengine_root() / "scripts" / "common" / "tools" / "rtl_trace_wrapper.sh" + ) assert wrapper.is_file() ctx = MagicMock() diff --git a/tests/unit/test_additional_context_defaults.py b/tests/unit/test_additional_context_defaults.py index f42d9c2e..10964ec8 100644 --- a/tests/unit/test_additional_context_defaults.py +++ b/tests/unit/test_additional_context_defaults.py @@ -1,8 +1,8 @@ """Tests for madengine.core.additional_context_defaults.""" from madengine.core.additional_context_defaults import ( - DEFAULT_GUEST_OS, DEFAULT_GPU_VENDOR, + DEFAULT_GUEST_OS, apply_build_context_defaults, ) diff --git a/tests/unit/test_auth.py b/tests/unit/test_auth.py index 47d4a6c7..1ce039dc 100644 --- a/tests/unit/test_auth.py +++ b/tests/unit/test_auth.py @@ -1,9 +1,10 @@ """Unit tests for 
madengine.core.auth module.""" import os -import pytest from unittest.mock import MagicMock, mock_open, patch +import pytest + from madengine.core.auth import load_credentials, login_to_registry @@ -134,14 +135,22 @@ def test_missing_registry_key_raises_when_raise_on_failure(self): console, rich_console = self._mocks() credentials = {"other_registry": {"username": "u", "password": "p"}} with pytest.raises(RuntimeError, match="myregistry.io"): - login_to_registry("myregistry.io", credentials, console, rich_console, raise_on_failure=True) + login_to_registry( + "myregistry.io", + credentials, + console, + rich_console, + raise_on_failure=True, + ) console.sh.assert_not_called() def test_missing_registry_key_returns_when_not_raise_on_failure(self): """Returns silently when registry key absent and raise_on_failure=False.""" console, rich_console = self._mocks() credentials = {"other_registry": {"username": "u", "password": "p"}} - login_to_registry("myregistry.io", credentials, console, rich_console, raise_on_failure=False) + login_to_registry( + "myregistry.io", credentials, console, rich_console, raise_on_failure=False + ) console.sh.assert_not_called() def test_invalid_credentials_format_raises(self): @@ -149,14 +158,18 @@ def test_invalid_credentials_format_raises(self): console, rich_console = self._mocks() credentials = {"dockerhub": {"token": "abc"}} with pytest.raises(RuntimeError, match="username|password"): - login_to_registry("docker.io", credentials, console, rich_console, raise_on_failure=True) + login_to_registry( + "docker.io", credentials, console, rich_console, raise_on_failure=True + ) console.sh.assert_not_called() def test_invalid_credentials_format_returns_when_not_raise_on_failure(self): """Returns silently when credentials format invalid and raise_on_failure=False.""" console, rich_console = self._mocks() credentials = {"dockerhub": {"token": "abc"}} - login_to_registry("docker.io", credentials, console, rich_console, raise_on_failure=False) + 
login_to_registry( + "docker.io", credentials, console, rich_console, raise_on_failure=False + ) console.sh.assert_not_called() def test_docker_io_normalised_to_dockerhub(self): @@ -184,12 +197,16 @@ def test_login_failure_raises_when_raise_on_failure(self): console.sh.side_effect = RuntimeError("auth failed") credentials = {"dockerhub": {"username": "user", "password": "pass"}} with pytest.raises(RuntimeError, match="auth failed"): - login_to_registry(None, credentials, console, rich_console, raise_on_failure=True) + login_to_registry( + None, credentials, console, rich_console, raise_on_failure=True + ) def test_login_failure_suppressed_when_not_raise_on_failure(self): """docker login error is suppressed when raise_on_failure=False.""" console, rich_console = self._mocks() console.sh.side_effect = RuntimeError("auth failed") credentials = {"dockerhub": {"username": "user", "password": "pass"}} - login_to_registry(None, credentials, console, rich_console, raise_on_failure=False) + login_to_registry( + None, credentials, console, rich_console, raise_on_failure=False + ) # Should not propagate the exception diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 164ac7a5..bbed4a71 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -16,8 +16,8 @@ import importlib import json import os -from io import StringIO import tempfile +from io import StringIO from unittest.mock import MagicMock, patch # third-party modules @@ -28,34 +28,34 @@ # project modules from madengine.cli import ( - app, - setup_logging, - create_args_namespace, - validate_additional_context, - save_summary_with_feedback, - display_results_table, - ExitCode, - VALID_GPU_VENDORS, - VALID_GUEST_OS, + DEFAULT_DATA_CONFIG, DEFAULT_MANIFEST_FILE, DEFAULT_PERF_OUTPUT, - DEFAULT_DATA_CONFIG, - DEFAULT_TOOLS_CONFIG, DEFAULT_TIMEOUT, + DEFAULT_TOOLS_CONFIG, + VALID_GPU_VENDORS, + VALID_GUEST_OS, + ExitCode, + app, + create_args_namespace, + display_results_table, + 
save_summary_with_feedback, + setup_logging, + validate_additional_context, ) from tests.fixtures.utils import ( BASE_DIR, MODEL_DIR, + generate_additional_context_for_machine, has_gpu, requires_gpu, - generate_additional_context_for_machine, ) - # ============================================================================ # CLI Utilities Tests # ============================================================================ + class TestSetupLogging: """Test the setup_logging function.""" @@ -159,8 +159,16 @@ def test_display_results_table_build_shows_gpu_arch_from_docker_builder(self): """Multi-arch builds record gpu_architecture; table must show it, not N/A.""" summary = { "successful_builds": [ - {"model": "dummy", "docker_image": "ci-dummy_dummy.ubuntu.amd_gfx90a", "gpu_architecture": "gfx90a"}, - {"model": "dummy", "docker_image": "ci-dummy_dummy.ubuntu.amd_gfx942", "gpu_architecture": "gfx942"}, + { + "model": "dummy", + "docker_image": "ci-dummy_dummy.ubuntu.amd_gfx90a", + "gpu_architecture": "gfx90a", + }, + { + "model": "dummy", + "docker_image": "ci-dummy_dummy.ubuntu.amd_gfx942", + "gpu_architecture": "gfx942", + }, ], "failed_builds": [], } @@ -209,6 +217,7 @@ def test_display_results_table_run_results(self): # CLI Validation Tests # ============================================================================ + class TestValidateAdditionalContext: """Test the validate_additional_context function.""" @@ -283,20 +292,20 @@ class TestProcessBatchManifest: def test_process_batch_manifest_valid_mixed_build_new(self): """Test processing batch manifest with mixed build_new values - core functionality.""" from madengine.cli.validators import process_batch_manifest - + batch_data = [ {"model_name": "model1", "build_new": True}, {"model_name": "model2", "build_new": False}, {"model_name": "model3", "build_new": True}, ] - + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: json.dump(batch_data, f) temp_file = f.name - + try: result = 
process_batch_manifest(temp_file) - + # Only models with build_new=True should be in build_tags assert result["build_tags"] == ["model1", "model3"] # All models should be in all_tags @@ -308,19 +317,19 @@ def test_process_batch_manifest_valid_mixed_build_new(self): def test_process_batch_manifest_default_build_new_false(self): """Test that build_new defaults to false when not specified.""" from madengine.cli.validators import process_batch_manifest - + batch_data = [ {"model_name": "model1"}, # No build_new field {"model_name": "model2", "build_new": True}, ] - + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: json.dump(batch_data, f) temp_file = f.name - + try: result = process_batch_manifest(temp_file) - + # model1 should not be in build_tags (defaults to false) assert result["build_tags"] == ["model2"] assert result["all_tags"] == ["model1", "model2"] @@ -330,28 +339,24 @@ def test_process_batch_manifest_default_build_new_false(self): def test_process_batch_manifest_with_registry_fields(self): """Test per-model registry override - key feature.""" from madengine.cli.validators import process_batch_manifest - + batch_data = [ { "model_name": "model1", "build_new": True, "registry": "docker.io/myorg", - "registry_image": "myorg/model1" - }, - { - "model_name": "model2", - "build_new": True, - "registry": "gcr.io/myproject" + "registry_image": "myorg/model1", }, + {"model_name": "model2", "build_new": True, "registry": "gcr.io/myproject"}, ] - + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: json.dump(batch_data, f) temp_file = f.name - + try: result = process_batch_manifest(temp_file) - + # Verify registry metadata is preserved assert result["manifest_data"][0]["registry"] == "docker.io/myorg" assert result["manifest_data"][0]["registry_image"] == "myorg/model1" @@ -362,17 +367,17 @@ def test_process_batch_manifest_with_registry_fields(self): def test_process_batch_manifest_error_handling(self): """Test 
error handling for various invalid inputs.""" from madengine.cli.validators import process_batch_manifest - + # File not found with pytest.raises(FileNotFoundError) as exc_info: process_batch_manifest("non_existent_file.json") assert "Batch manifest file not found" in str(exc_info.value) - + # Invalid JSON with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: f.write("invalid json content{") temp_file = f.name - + try: with pytest.raises(ValueError) as exc_info: process_batch_manifest(temp_file) @@ -383,26 +388,26 @@ def test_process_batch_manifest_error_handling(self): def test_process_batch_manifest_validation(self): """Test validation rules for batch manifest.""" from madengine.cli.validators import process_batch_manifest - + # Not a list batch_data = {"model_name": "model1", "build_new": True} with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: json.dump(batch_data, f) temp_file = f.name - + try: with pytest.raises(ValueError) as exc_info: process_batch_manifest(temp_file) assert "must be a list" in str(exc_info.value) finally: os.unlink(temp_file) - + # Missing model_name batch_data = [{"build_new": True}] with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: json.dump(batch_data, f) temp_file = f.name - + try: with pytest.raises(ValueError) as exc_info: process_batch_manifest(temp_file) @@ -415,6 +420,7 @@ def test_process_batch_manifest_validation(self): # CLI exit code and error handling tests (CI / Jenkins smoke) # ============================================================================ + @pytest.fixture def runner() -> CliRunner: return CliRunner() diff --git a/tests/unit/test_config_integration.py b/tests/unit/test_config_integration.py new file mode 100644 index 00000000..fdf32bd8 --- /dev/null +++ b/tests/unit/test_config_integration.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +"""Integration tests for load_config end-to-end pipeline.""" + +from pathlib import Path + 
+import pytest + +from madengine.config import load_config +from madengine.core.errors import ConfigurationError + +FIXTURES_DIR = Path(__file__).parent.parent / "fixtures" / "configs" + + +class TestLoadConfigEndToEnd: + def test_defaults_produce_valid_context(self): + ctx, meta = load_config([]) + assert ctx["gpu_vendor"] == "AMD" + assert ctx["guest_os"] == "UBUNTU" + assert meta["model"]["tags"] == [] + + def test_scheduler_slurm(self): + ctx, meta = load_config(["scheduler=slurm"]) + assert "slurm" in ctx + assert ctx["slurm"]["partition"] == "amd-rccl" + + def test_launcher_torchrun(self): + ctx, meta = load_config(["launcher=torchrun"]) + assert ctx["distributed"]["enabled"] is True + assert ctx["distributed"]["launcher"] == "torchrun" + + def test_combined_overrides(self): + ctx, meta = load_config( + [ + "scheduler=slurm", + "launcher=torchrun", + "distributed.nnodes=4", + "+env=nccl_debug", + ] + ) + assert ctx["distributed"]["nnodes"] == 4 + assert ctx["env_vars"]["NCCL_DEBUG"] == "INFO" + assert "slurm" in ctx + + def test_user_yaml_file(self): + yaml_path = str(FIXTURES_DIR / "test_slurm_job.yaml") + ctx, meta = load_config([yaml_path]) + assert meta["model"]["tags"] == ["dummy"] + assert ctx["slurm"]["partition"] == "test-partition" + assert ctx["distributed"]["nnodes"] == 2 + assert ctx["env_vars"]["MY_VAR"] == "test_value" + assert ctx["debug"] is True + + def test_user_yaml_with_override(self): + # User YAML is merged last (highest priority). Overrides for keys present in + # the user YAML are overwritten by it; new keys added via '+' syntax survive. 
+ yaml_path = str(FIXTURES_DIR / "test_slurm_job.yaml") + ctx, meta = load_config([yaml_path, "+env_vars.EXTRA_VAR=hello"]) + # MY_VAR from user YAML is preserved + assert ctx["env_vars"]["MY_VAR"] == "test_value" + # EXTRA_VAR added via '+' override is also present + assert ctx["env_vars"]["EXTRA_VAR"] == "hello" + + def test_docker_keys_translated(self): + # Appending to an empty dict in Hydra requires the '+' prefix + ctx, meta = load_config(["+docker.build_args.KEY=val"]) + assert ctx["docker_build_arg"]["KEY"] == "val" + + def test_slurm_and_k8s_conflict_raises(self): + # scheduler=slurm adds 'slurm' section; +k8s.namespace appends 'k8s' section. + # Validator detects both and raises ConfigurationError. + with pytest.raises(ConfigurationError, match="Cannot specify both"): + load_config(["scheduler=slurm", "+k8s.namespace=test"]) + + def test_unsupported_platform_raises(self): + with pytest.raises(ConfigurationError, match="not yet supported"): + load_config(["platform=bare_metal"]) + + def test_container_image_promoted(self): + ctx, meta = load_config(["model.container_image=myimage:latest"]) + assert ctx["MAD_CONTAINER_IMAGE"] == "myimage:latest" + + def test_model_tags_in_metadata(self): + ctx, meta = load_config(["model.tags=[dummy,bert]"]) + assert meta["model"]["tags"] == ["dummy", "bert"] + assert "model" not in ctx + + def test_profile_append(self): + ctx, meta = load_config(["+profile=mi300x_8gpu"]) + assert ctx["gpu_type"] == "mi300x" + assert ctx["env_vars"]["HSA_ENABLE_SDMA"] == "0" + + def test_tools_append(self): + ctx, meta = load_config(["+tools=rocprofv3_lightweight"]) + assert len(ctx["tools"]) == 1 + assert ctx["tools"][0]["name"] == "rocprofv3_lightweight" diff --git a/tests/unit/test_config_loader.py b/tests/unit/test_config_loader.py index 65184e68..5ca27829 100644 --- a/tests/unit/test_config_loader.py +++ b/tests/unit/test_config_loader.py @@ -14,9 +14,9 @@ """ import json -import pytest from pathlib import Path +import pytest from 
jinja2 import Template from madengine.deployment.config_loader import ConfigLoader, apply_deployment_config @@ -42,24 +42,20 @@ def load_config_file(relative_path): full_path = get_project_root() / relative_path if not full_path.exists(): pytest.skip(f"Config file not found: {relative_path}") - + with open(full_path) as f: return json.load(f) class TestConfigLoaderBasics: """Test basic ConfigLoader functionality.""" - + def test_minimal_single_gpu(self): """Test minimal single GPU config gets proper defaults.""" - user_config = { - "k8s": { - "gpu_count": 1 - } - } - + user_config = {"k8s": {"gpu_count": 1}} + result = ConfigLoader.load_k8s_config(user_config) - + # Validate defaults applied assert result["k8s"]["gpu_count"] == 1 assert result["k8s"]["memory"] == "16Gi" @@ -67,22 +63,16 @@ def test_minimal_single_gpu(self): assert result["k8s"]["namespace"] == "default" assert result["gpu_vendor"] == "AMD" assert "OMP_NUM_THREADS" in result["env_vars"] - + def test_minimal_multi_gpu(self): """Test minimal multi-GPU config gets proper defaults.""" user_config = { - "k8s": { - "gpu_count": 2 - }, - "distributed": { - "launcher": "torchrun", - "nnodes": 1, - "nproc_per_node": 2 - } + "k8s": {"gpu_count": 2}, + "distributed": {"launcher": "torchrun", "nnodes": 1, "nproc_per_node": 2}, } - + result = ConfigLoader.load_k8s_config(user_config) - + # Validate multi-GPU defaults assert result["k8s"]["gpu_count"] == 2 assert result["k8s"]["memory"] == "64Gi" @@ -91,63 +81,49 @@ def test_minimal_multi_gpu(self): assert result["env_vars"]["NCCL_DEBUG"] == "WARN" assert "MIOPEN_FIND_MODE" in result["env_vars"] assert result["distributed"]["backend"] == "nccl" - + def test_minimal_multi_node(self): """Test minimal multi-node config gets proper defaults.""" user_config = { - "k8s": { - "gpu_count": 2 - }, - "distributed": { - "launcher": "torchrun", - "nnodes": 2, - "nproc_per_node": 2 - } + "k8s": {"gpu_count": 2}, + "distributed": {"launcher": "torchrun", "nnodes": 2, 
"nproc_per_node": 2}, } - + result = ConfigLoader.load_k8s_config(user_config) - + # Validate multi-node defaults assert result["k8s"]["host_ipc"] == True assert "NCCL_DEBUG_SUBSYS" in result["env_vars"] assert "NCCL_TIMEOUT" in result["env_vars"] - + def test_nvidia_config(self): """Test NVIDIA GPU config gets proper defaults.""" user_config = { "gpu_vendor": "NVIDIA", - "k8s": { - "gpu_count": 4 - }, - "distributed": { - "launcher": "torchrun", - "nnodes": 1, - "nproc_per_node": 4 - } + "k8s": {"gpu_count": 4}, + "distributed": {"launcher": "torchrun", "nnodes": 1, "nproc_per_node": 4}, } - + result = ConfigLoader.load_k8s_config(user_config) - + # Validate NVIDIA defaults assert result["k8s"]["gpu_resource_name"] == "nvidia.com/gpu" assert "NCCL_P2P_DISABLE" in result["env_vars"] assert result["env_vars"]["OMP_NUM_THREADS"] == "12" - + def test_override_behavior(self): """Test that user overrides work correctly.""" user_config = { "k8s": { "gpu_count": 1, "namespace": "custom-namespace", - "memory": "32Gi" # Override default 16Gi + "memory": "32Gi", # Override default 16Gi }, - "env_vars": { - "CUSTOM_VAR": "custom_value" - } + "env_vars": {"CUSTOM_VAR": "custom_value"}, } - + result = ConfigLoader.load_k8s_config(user_config) - + # Validate overrides assert result["k8s"]["namespace"] == "custom-namespace" assert result["k8s"]["memory"] == "32Gi" # Overridden @@ -161,8 +137,10 @@ class TestApplyDeploymentConfig: def test_apply_slurm_config_mutates_and_returns(self): """apply_deployment_config mutates config.additional_context and returns full config.""" + class FakeConfig: additional_context = {"slurm": {"nodes": 2}} + config = FakeConfig() result = apply_deployment_config(config, ConfigLoader.load_slurm_config) assert result is config.additional_context @@ -171,8 +149,10 @@ class FakeConfig: def test_apply_k8s_config_mutates_and_returns(self): """apply_deployment_config with load_k8s_config mutates and returns full config.""" + class FakeConfig: 
additional_context = {"k8s": {"gpu_count": 1}} + config = FakeConfig() result = apply_deployment_config(config, ConfigLoader.load_k8s_config) assert result is config.additional_context @@ -182,31 +162,39 @@ class FakeConfig: class TestConfigLoaderK8sConfigs: """Test with actual K8s config files (if they exist).""" - + @pytest.mark.skipif( - not config_exists("examples/k8s-configs/basic/01-native-single-node-single-gpu.json"), - reason="K8s config file not found" + not config_exists( + "examples/k8s-configs/basic/01-native-single-node-single-gpu.json" + ), + reason="K8s config file not found", ) def test_k8s_single_gpu_config(self): """Test K8s single GPU config file.""" - user_config = load_config_file("examples/k8s-configs/basic/01-native-single-node-single-gpu.json") + user_config = load_config_file( + "examples/k8s-configs/basic/01-native-single-node-single-gpu.json" + ) result = ConfigLoader.load_k8s_config(user_config) - + # Validate key fields are preserved assert result["k8s"]["gpu_count"] == 1 assert "memory" in result["k8s"] assert "namespace" in result["k8s"] assert result["gpu_vendor"] in ["AMD", "NVIDIA"] - + @pytest.mark.skipif( - not config_exists("examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json"), - reason="K8s multi-GPU config file not found" + not config_exists( + "examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json" + ), + reason="K8s multi-GPU config file not found", ) def test_k8s_multi_gpu_config(self): """Test K8s multi-GPU config file.""" - user_config = load_config_file("examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json") + user_config = load_config_file( + "examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json" + ) result = ConfigLoader.load_k8s_config(user_config) - + # Validate multi-GPU config assert result["k8s"]["gpu_count"] >= 2 assert "distributed" in result @@ -216,36 +204,42 @@ def test_k8s_multi_gpu_config(self): class TestConfigLoaderSlurmConfigs: """Test with actual 
SLURM config files (if they exist).""" - + @pytest.mark.skipif( - not config_exists("examples/slurm-configs/basic/01-single-node-single-gpu.json"), - reason="SLURM config file not found" + not config_exists( + "examples/slurm-configs/basic/01-single-node-single-gpu.json" + ), + reason="SLURM config file not found", ) def test_slurm_single_gpu_config(self): """Test SLURM single GPU config file.""" - user_config = load_config_file("examples/slurm-configs/basic/01-single-node-single-gpu.json") + user_config = load_config_file( + "examples/slurm-configs/basic/01-single-node-single-gpu.json" + ) result = ConfigLoader.load_slurm_config(user_config) - + # Validate SLURM config structure assert "slurm" in result assert result["slurm"]["nodes"] == 1 assert result["slurm"]["gpus_per_node"] >= 1 - + @pytest.mark.skipif( not config_exists("examples/slurm-configs/basic/06-vllm-multi-node.json"), - reason="SLURM vLLM multi-node config file not found" + reason="SLURM vLLM multi-node config file not found", ) def test_slurm_vllm_multi_node_config(self): """Test SLURM vLLM multi-node config file.""" - user_config = load_config_file("examples/slurm-configs/basic/06-vllm-multi-node.json") + user_config = load_config_file( + "examples/slurm-configs/basic/06-vllm-multi-node.json" + ) result = ConfigLoader.load_slurm_config(user_config) - + # Validate multi-node vLLM config assert "slurm" in result assert result["slurm"]["nodes"] >= 2 assert result["slurm"]["gpus_per_node"] >= 1 assert "distributed" in result - + # Check for new preflight node check parameters if "enable_node_check" in result["slurm"]: assert isinstance(result["slurm"]["enable_node_check"], bool) @@ -290,88 +284,66 @@ def test_job_script_includes_nodelist_when_set(self): class TestConfigLoaderDeploymentType: """Test deployment type inference and validation.""" - + def test_auto_infer_k8s(self): """Test k8s deployment type is auto-inferred from k8s field presence.""" - user_config = { - "k8s": { - "gpu_count": 1 - } - } 
- + user_config = {"k8s": {"gpu_count": 1}} + result = ConfigLoader.load_config(user_config) - + # Validate k8s config was loaded and defaults applied assert "k8s" in result assert result["k8s"]["gpu_count"] == 1 assert "memory" in result["k8s"] # Default was applied - + def test_auto_infer_slurm(self): """Test slurm deployment type is auto-inferred from slurm field presence.""" - user_config = { - "slurm": { - "nodes": 1, - "gpus_per_node": 4 - } - } - + user_config = {"slurm": {"nodes": 1, "gpus_per_node": 4}} + result = ConfigLoader.load_config(user_config) - + # Validate slurm config was loaded and defaults applied assert "slurm" in result assert result["slurm"]["nodes"] == 1 assert result["slurm"]["gpus_per_node"] == 4 - + def test_auto_infer_local(self): """Test local deployment when no k8s/slurm present.""" - user_config = { - "env_vars": {"MY_VAR": "value"} - } - + user_config = {"env_vars": {"MY_VAR": "value"}} + result = ConfigLoader.load_config(user_config) - + # Validate local config (no k8s or slurm fields) assert "k8s" not in result or result.get("k8s") == {} assert "slurm" not in result or result.get("slurm") == {} assert result["env_vars"]["MY_VAR"] == "value" - + def test_conflict_k8s_and_slurm(self): """Test error when both k8s and slurm fields present.""" - user_config = { - "k8s": {"gpu_count": 1}, - "slurm": {"nodes": 2} - } - + user_config = {"k8s": {"gpu_count": 1}, "slurm": {"nodes": 2}} + with pytest.raises(ValueError, match="Both 'k8s' and 'slurm'"): ConfigLoader.load_config(user_config) - + def test_conflict_explicit_deploy_mismatch(self): """Test error when explicit deploy field conflicts with config presence.""" - user_config = { - "deploy": "slurm", - "k8s": {"gpu_count": 1} - } - + user_config = {"deploy": "slurm", "k8s": {"gpu_count": 1}} + with pytest.raises(ValueError, match="Conflicting deployment"): ConfigLoader.load_config(user_config) - + def test_explicit_deploy_matching(self): """Test that explicit deploy field works when it 
matches config.""" - user_config = { - "deploy": "k8s", - "k8s": {"gpu_count": 1} - } - + user_config = {"deploy": "k8s", "k8s": {"gpu_count": 1}} + result = ConfigLoader.load_config(user_config) - + # Should work fine since deploy matches k8s presence # The deploy field may or may not be preserved in result assert result["k8s"]["gpu_count"] == 1 assert "memory" in result["k8s"] # Defaults applied - # Run pytest if executed directly if __name__ == "__main__": pytest.main([__file__, "-v", "-s"]) - diff --git a/tests/unit/test_config_schema.py b/tests/unit/test_config_schema.py new file mode 100644 index 00000000..ab6ad103 --- /dev/null +++ b/tests/unit/test_config_schema.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +"""Tests for ConfigValidator.""" + +from omegaconf import DictConfig, OmegaConf + +from madengine.config.schema import ConfigValidator + + +def make_cfg(data: dict) -> DictConfig: + return OmegaConf.create(data) + + +class TestConflictDetection: + def test_slurm_and_k8s_conflict(self): + cfg = make_cfg({"slurm": {"partition": "gpu"}, "k8s": {"namespace": "default"}}) + errors = ConfigValidator.validate(cfg) + assert any("Cannot specify both" in e for e in errors) + + def test_slurm_only_no_conflict(self): + cfg = make_cfg({"slurm": {"partition": "gpu"}}) + errors = ConfigValidator.validate(cfg) + assert not any("Cannot specify both" in e for e in errors) + + def test_k8s_only_no_conflict(self): + cfg = make_cfg({"k8s": {"namespace": "default"}}) + errors = ConfigValidator.validate(cfg) + assert not any("Cannot specify both" in e for e in errors) + + +class TestDistributedValidation: + def test_enabled_without_launcher(self): + cfg = make_cfg({"distributed": {"enabled": True}}) + errors = ConfigValidator.validate(cfg) + assert any("requires distributed.launcher" in e for e in errors) + + def test_enabled_with_launcher(self): + cfg = make_cfg({"distributed": {"enabled": True, "launcher": "torchrun"}}) + errors = ConfigValidator.validate(cfg) + assert not 
any("requires distributed.launcher" in e for e in errors) + + def test_invalid_nnodes(self): + cfg = make_cfg( + {"distributed": {"enabled": True, "launcher": "torchrun", "nnodes": -1}} + ) + errors = ConfigValidator.validate(cfg) + assert any("positive integer" in e for e in errors) + + def test_valid_nnodes(self): + cfg = make_cfg( + {"distributed": {"enabled": True, "launcher": "torchrun", "nnodes": 4}} + ) + errors = ConfigValidator.validate(cfg) + assert not any("positive integer" in e for e in errors) + + +class TestUnknownKeys: + def test_unknown_top_level_key(self): + cfg = make_cfg({"gpu_vendor": "AMD", "typo_key": "oops"}) + errors = ConfigValidator.validate(cfg) + assert any("Unknown config key: 'typo_key'" in e for e in errors) + + def test_known_keys_accepted(self): + cfg = make_cfg({"gpu_vendor": "AMD", "debug": True, "env_vars": {}}) + errors = ConfigValidator.validate(cfg) + assert not any("Unknown config key" in e for e in errors) + + +class TestPlatformValidation: + def test_unsupported_platform(self): + cfg = make_cfg({"platform": {"type": "bare_metal"}}) + errors = ConfigValidator.validate(cfg) + assert any("not yet supported" in e for e in errors) + + def test_docker_platform_ok(self): + cfg = make_cfg({"platform": {"type": "docker"}}) + errors = ConfigValidator.validate(cfg) + assert not any("not yet supported" in e for e in errors) diff --git a/tests/unit/test_config_translator.py b/tests/unit/test_config_translator.py new file mode 100644 index 00000000..7d73fc87 --- /dev/null +++ b/tests/unit/test_config_translator.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +"""Tests for ConfigTranslator.""" + +from omegaconf import DictConfig, OmegaConf + +from madengine.config.translator import ConfigTranslator + + +def make_cfg(overrides: dict) -> DictConfig: + """Build a DictConfig from a base + overrides for testing.""" + base = { + "model": { + "tags": [], + "manifest_file": None, + "container_image": None, + "skip_run": False, + "timeout": None, + 
}, + "docker": { + "build_args": {}, + "env_vars": {}, + "mounts": {}, + "gpus": None, + "cpus": None, + "additional_run_options": None, + "keep_alive": False, + "clean_cache": False, + }, + "build": { + "registry": None, + "target_archs": [], + "manifest_output": "build_manifest.json", + }, + "env_vars": {}, + "debug": False, + "live_output": False, + "log_error": {"pattern_scan": True, "benign_patterns": [], "patterns": []}, + "tools": [], + "pre_scripts": [], + "post_scripts": [], + "encapsulate_script": None, + "data_config": "data.json", + "output": "perf.csv", + "summary_output": None, + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "runtime": { + "devices": [], + "capabilities": [], + "security_opts": [], + "network_mode": "host", + "ipc": "host", + "groups": [], + "use_gpu_flag": False, + }, + "platform": {"type": "docker"}, + } + merged = {**base, **overrides} + return OmegaConf.create(merged) + + +class TestDockerKeyMapping: + def test_build_args_mapped(self): + cfg = make_cfg( + { + "docker": { + "build_args": {"KEY": "val"}, + "env_vars": {}, + "mounts": {}, + "gpus": None, + "cpus": None, + "additional_run_options": None, + "keep_alive": False, + "clean_cache": False, + } + } + ) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["docker_build_arg"] == {"KEY": "val"} + + def test_env_vars_mapped(self): + cfg = make_cfg( + { + "docker": { + "build_args": {}, + "env_vars": {"A": "1"}, + "mounts": {}, + "gpus": None, + "cpus": None, + "additional_run_options": None, + "keep_alive": False, + "clean_cache": False, + } + } + ) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["docker_env_vars"] == {"A": "1"} + + def test_null_gpus_excluded(self): + cfg = make_cfg({}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "docker_gpus" not in ctx + + def test_non_null_gpus_included(self): + cfg = make_cfg( + { + "docker": { + "build_args": {}, + "env_vars": {}, + "mounts": {}, + "gpus": "0-3", + "cpus": 
None, + "additional_run_options": None, + "keep_alive": False, + "clean_cache": False, + } + } + ) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["docker_gpus"] == "0-3" + + +class TestLogErrorMapping: + def test_pattern_scan_mapped(self): + cfg = make_cfg( + { + "log_error": { + "pattern_scan": False, + "benign_patterns": [], + "patterns": [], + } + } + ) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["log_error_pattern_scan"] is False + + def test_patterns_mapped(self): + cfg = make_cfg( + { + "log_error": { + "pattern_scan": True, + "benign_patterns": ["OK"], + "patterns": ["ERR"], + } + } + ) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["log_error_benign_patterns"] == ["OK"] + assert ctx["log_error_patterns"] == ["ERR"] + + +class TestPassthroughKeys: + def test_gpu_vendor_passthrough(self): + cfg = make_cfg({"gpu_vendor": "NVIDIA"}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["gpu_vendor"] == "NVIDIA" + + def test_env_vars_passthrough(self): + cfg = make_cfg({"env_vars": {"MY": "VAR"}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["env_vars"] == {"MY": "VAR"} + + def test_slurm_passthrough(self): + cfg = make_cfg({"slurm": {"partition": "gpu"}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["slurm"] == {"partition": "gpu"} + + def test_distributed_passthrough(self): + cfg = make_cfg({"distributed": {"enabled": True, "launcher": "torchrun"}}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["distributed"]["launcher"] == "torchrun" + + def test_tools_passthrough(self): + cfg = make_cfg({"tools": [{"name": "rpd"}]}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["tools"] == [{"name": "rpd"}] + + +class TestExtractedKeys: + def test_model_extracted(self): + cfg = make_cfg( + { + "model": { + "tags": ["dummy"], + "manifest_file": None, + "container_image": None, + 
"skip_run": False, + "timeout": 300, + } + } + ) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "model" not in ctx + assert meta["model"]["tags"] == ["dummy"] + assert meta["model"]["timeout"] == 300 + + def test_build_extracted(self): + cfg = make_cfg( + { + "build": { + "registry": "myregistry.io", + "target_archs": ["gfx942"], + "manifest_output": "build_manifest.json", + } + } + ) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "build" not in ctx + assert meta["build"]["registry"] == "myregistry.io" + + def test_platform_extracted(self): + cfg = make_cfg({}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "platform" not in ctx + assert meta["platform"]["type"] == "docker" + + def test_container_image_promoted(self): + cfg = make_cfg( + { + "model": { + "tags": [], + "manifest_file": None, + "container_image": "myimage:latest", + "skip_run": False, + "timeout": None, + } + } + ) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert ctx["MAD_CONTAINER_IMAGE"] == "myimage:latest" + + def test_runtime_extracted(self): + cfg = make_cfg({}) + ctx, meta = ConfigTranslator.to_additional_context(cfg) + assert "runtime" not in ctx + assert "runtime" in meta diff --git a/tests/unit/test_constants.py b/tests/unit/test_constants.py index 248c8ac9..12d724a3 100644 --- a/tests/unit/test_constants.py +++ b/tests/unit/test_constants.py @@ -5,15 +5,15 @@ from unittest.mock import patch from madengine.core.constants import ( - NAS_NODES, + _DEFAULT_MAD_AWS_S3, + _DEFAULT_MAD_MINIO, + _DEFAULT_NAS_NODES, + _DEFAULT_PUBLIC_GITHUB_ROCM_KEY, MAD_AWS_S3, MAD_MINIO, + NAS_NODES, PUBLIC_GITHUB_ROCM_KEY, _get_env_or_creds_or_default, - _DEFAULT_NAS_NODES, - _DEFAULT_MAD_AWS_S3, - _DEFAULT_MAD_MINIO, - _DEFAULT_PUBLIC_GITHUB_ROCM_KEY, ) @@ -25,6 +25,7 @@ def test_env_override_returns_parsed_json(self): with patch.dict(os.environ, {"TEST_KEY": '[{"a": 1}]'}, clear=False): # Need to pass creds - we patch CREDS via the 
module import madengine.core.constants as constants_module + with patch.object(constants_module, "CREDS", {}): result = _get_env_or_creds_or_default( "TEST_KEY", "TEST_KEY", _DEFAULT_NAS_NODES @@ -35,6 +36,7 @@ def test_env_invalid_json_returns_default(self): """When env is set with invalid JSON, default is returned.""" with patch.dict(os.environ, {"TEST_KEY": "not json"}, clear=False): import madengine.core.constants as constants_module + with patch.object(constants_module, "CREDS", {}): result = _get_env_or_creds_or_default( "TEST_KEY", "TEST_KEY", _DEFAULT_NAS_NODES @@ -51,6 +53,7 @@ def test_creds_fallback_when_env_unset(self): except Exception: pass import madengine.core.constants as constants_module + with patch.object(constants_module, "CREDS", {"TEST_KEY": [{"from": "creds"}]}): result = _get_env_or_creds_or_default( "TEST_KEY", "TEST_KEY", _DEFAULT_NAS_NODES @@ -60,6 +63,7 @@ def test_creds_fallback_when_env_unset(self): def test_default_when_env_and_creds_unset(self): """When env unset and creds missing key, default is returned.""" import madengine.core.constants as constants_module + with patch.dict(os.environ, {}, clear=False): if "TEST_KEY" in os.environ: del os.environ["TEST_KEY"] @@ -98,6 +102,7 @@ def test_mad_minio_has_expected_keys(self): def test_public_github_rocm_key_has_expected_keys(self): """PUBLIC_GITHUB_ROCM_KEY has username and token (no value assert to avoid leaking secrets).""" assert isinstance(PUBLIC_GITHUB_ROCM_KEY, dict) - assert set(PUBLIC_GITHUB_ROCM_KEY.keys()) >= {"username", "token"}, ( - "PUBLIC_GITHUB_ROCM_KEY must have at least keys 'username' and 'token'" - ) + assert set(PUBLIC_GITHUB_ROCM_KEY.keys()) >= { + "username", + "token", + }, "PUBLIC_GITHUB_ROCM_KEY must have at least keys 'username' and 'token'" diff --git a/tests/unit/test_container_runner.py b/tests/unit/test_container_runner.py index 2c10cbe7..8b0cbec0 100644 --- a/tests/unit/test_container_runner.py +++ b/tests/unit/test_container_runner.py @@ -12,7 +12,6 @@ 
from madengine.deployment.base import PERFORMANCE_LOG_PATTERN from madengine.execution.container_runner import ContainerRunner - PERF_PATTERN = PERFORMANCE_LOG_PATTERN @@ -26,19 +25,34 @@ def _match(self, log_line): # --- formats that were already handled before the regex change --- def test_basic_integer(self): - assert self._match("performance: 12345 samples_per_second") == ("12345", "samples_per_second") + assert self._match("performance: 12345 samples_per_second") == ( + "12345", + "samples_per_second", + ) def test_decimal(self): - assert self._match("performance: 100.5 samples_per_second") == ("100.5", "samples_per_second") + assert self._match("performance: 100.5 samples_per_second") == ( + "100.5", + "samples_per_second", + ) def test_scientific_lowercase_e(self): - assert self._match("performance: 1.23e+4 samples_per_second") == ("1.23e+4", "samples_per_second") + assert self._match("performance: 1.23e+4 samples_per_second") == ( + "1.23e+4", + "samples_per_second", + ) def test_scientific_negative_exponent(self): - assert self._match("performance: 1.23e-4 samples_per_second") == ("1.23e-4", "samples_per_second") + assert self._match("performance: 1.23e-4 samples_per_second") == ( + "1.23e-4", + "samples_per_second", + ) def test_zero(self): - assert self._match("performance: 0 samples_per_second") == ("0", "samples_per_second") + assert self._match("performance: 0 samples_per_second") == ( + "0", + "samples_per_second", + ) def test_metric_with_digits(self): assert self._match("performance: 123 metric123") == ("123", "metric123") @@ -50,15 +64,24 @@ def test_metric_starting_with_underscore(self): def test_unit_suffix_slash_s(self): """Value followed by /s unit suffix: suffix is stripped, metric parsed correctly.""" - assert self._match("performance: 14164/s samples_per_second") == ("14164", "samples_per_second") + assert self._match("performance: 14164/s samples_per_second") == ( + "14164", + "samples_per_second", + ) def test_unit_suffix_and_comma(self): 
"""Value with /s suffix and comma separator.""" - assert self._match("performance: 14164.5/s, samples_per_second") == ("14164.5", "samples_per_second") + assert self._match("performance: 14164.5/s, samples_per_second") == ( + "14164.5", + "samples_per_second", + ) def test_comma_separator_no_suffix(self): """Comma after value without a unit suffix.""" - assert self._match("performance: 100.5, samples_per_second") == ("100.5", "samples_per_second") + assert self._match("performance: 100.5, samples_per_second") == ( + "100.5", + "samples_per_second", + ) def test_comma_before_suffix(self): """Comma immediately before /s suffix: 123,/s metric.""" @@ -72,25 +95,40 @@ def test_comma_space_before_suffix(self): def test_scientific_uppercase_e(self): """Uppercase E in scientific notation (v1 supported, old v2 broke on this).""" - assert self._match("performance: 1.23E+4 samples_per_second") == ("1.23E+4", "samples_per_second") + assert self._match("performance: 1.23E+4 samples_per_second") == ( + "1.23E+4", + "samples_per_second", + ) def test_positive_sign(self): """Explicitly signed positive value (v1 supported via [+|-]? prefix).""" - assert self._match("performance: +123.45 samples_per_second") == ("+123.45", "samples_per_second") + assert self._match("performance: +123.45 samples_per_second") == ( + "+123.45", + "samples_per_second", + ) def test_negative_sign(self): """Signed negative value (v1 supported).""" - assert self._match("performance: -123.45 samples_per_second") == ("-123.45", "samples_per_second") + assert self._match("performance: -123.45 samples_per_second") == ( + "-123.45", + "samples_per_second", + ) def test_leading_dot_decimal(self): """Leading-dot decimal without integer part (v1 supported via [0-9]*[.]?[0-9]*).""" - assert self._match("performance: .5 samples_per_second") == (".5", "samples_per_second") + assert self._match("performance: .5 samples_per_second") == ( + ".5", + "samples_per_second", + ) # --- slash-containing metric names (e.g. 
samples/sec, tokens/sec) --- def test_metric_samples_per_sec_slash(self): """samples/sec metric (used by _determine_aggregation_method) is parsed.""" - assert self._match("performance: 1234.5 samples/sec") == ("1234.5", "samples/sec") + assert self._match("performance: 1234.5 samples/sec") == ( + "1234.5", + "samples/sec", + ) def test_metric_tokens_per_sec_slash(self): """tokens/sec metric (used by _determine_aggregation_method) is parsed.""" @@ -137,7 +175,9 @@ class TestCreateSetupFailurePerfEntry: def test_returns_dict_with_status_failure(self): """Entry has status FAILURE and model name.""" runner = ContainerRunner(context=MagicMock(), console=MagicMock()) - runner.context.ctx = {"docker_env_vars": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a"}} + runner.context.ctx = { + "docker_env_vars": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a"} + } model_info = {"name": "org/model1", "tags": "v1", "n_gpus": "2"} build_info = {"dockerfile": "Dockerfile", "docker_image": "img:latest"} @@ -195,9 +235,16 @@ def test_setup_failure_appends_to_failed_runs_and_records_to_csv( ) manifest = { - "built_images": {"img1": {"docker_image": "local/img1", "dockerfile": "D"}}, + "built_images": { + "img1": {"docker_image": "local/img1", "dockerfile": "D"} + }, "built_models": { - "img1": {"name": "test/model", "tags": "t1", "n_gpus": "1", "args": ""} + "img1": { + "name": "test/model", + "tags": "t1", + "n_gpus": "1", + "args": "", + } }, } with open(manifest_path, "w") as f: diff --git a/tests/unit/test_container_runner_helpers.py b/tests/unit/test_container_runner_helpers.py index a4d96539..2d862a69 100644 --- a/tests/unit/test_container_runner_helpers.py +++ b/tests/unit/test_container_runner_helpers.py @@ -110,10 +110,7 @@ def test_excludes_grep_meta_line(self): assert log_text_has_error_pattern(log, "RuntimeError:", [], ()) def test_regex_benign_excludes_rocprof_style_line(self): - log = ( - "E12345678 generateRocpd.cpp: noise\n" - "clean RuntimeError: real issue\n" - ) + log = "E12345678 
generateRocpd.cpp: noise\n" "clean RuntimeError: real issue\n" assert log_text_has_error_pattern( log, "RuntimeError:", diff --git a/tests/unit/test_context_logic.py b/tests/unit/test_context_logic.py index 17d1de5d..28ebfa85 100644 --- a/tests/unit/test_context_logic.py +++ b/tests/unit/test_context_logic.py @@ -6,8 +6,9 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ +from unittest.mock import MagicMock, Mock, patch + import pytest -from unittest.mock import Mock, MagicMock, patch from madengine.core.context import Context from madengine.utils.gpu_validator import GPUVendor @@ -16,7 +17,7 @@ @pytest.mark.unit class TestContextInitialization: """Test Context object initialization.""" - + @patch.object(Context, "get_gpu_renderD_nodes", return_value=None) @patch.object(Context, "get_docker_gpus", return_value="0") @patch.object(Context, "get_system_gpu_product_name", return_value="Test GPU") @@ -36,11 +37,11 @@ def test_context_initializes_with_defaults( ): """Context should initialize with system defaults.""" context = Context() - + assert context.get_gpu_vendor() == "AMD" assert context.get_system_ngpus() == 1 assert context.get_system_gpu_architecture() == "gfx90a" - + # REMOVED: test_context_detects_nvidia_gpus and test_context_handles_cpu_only # These tests require actual GPU detection and are better suited as integration tests. 
# Context initialization tests are covered in integration/test_platform_integration.py @@ -49,7 +50,7 @@ def test_context_initializes_with_defaults( @pytest.mark.unit class TestBuildArgGeneration: """Test Docker build argument generation logic.""" - + @patch.object(Context, "get_gpu_renderD_nodes", return_value=None) @patch.object(Context, "get_docker_gpus", return_value="0") @patch.object(Context, "get_system_gpu_product_name", return_value="Test GPU") @@ -72,12 +73,14 @@ def test_generates_build_args_for_amd( context.ctx = { "docker_build_arg": { "MAD_GPU_VENDOR": "AMD", - "MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a" + "MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a", } } - + assert context.ctx["docker_build_arg"]["MAD_GPU_VENDOR"] == "AMD" - assert context.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] == "gfx90a" + assert ( + context.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] == "gfx90a" + ) @pytest.mark.unit @@ -101,9 +104,9 @@ def _make_build_only_ctx(additional_context="{}") -> Context: Returns a fully constructed Context whose ctx dict is populated from additional_context but whose init_build_context has NOT yet run, so callers can invoke it in a controlled way. """ - with patch.object(Context, "init_build_context"), \ - patch.object(Context, "get_ctx_test", return_value="test"), \ - patch.object(Context, "get_host_os", return_value="linux"): + with patch.object(Context, "init_build_context"), patch.object( + Context, "get_ctx_test", return_value="test" + ), patch.object(Context, "get_host_os", return_value="linux"): ctx = Context(additional_context=additional_context, build_only_mode=True) return ctx @@ -122,11 +125,19 @@ def test_auto_detect_injects_arch_when_absent(self): # get_gpu_tool_manager is a module-level import in context.py; patch it there. # detect_gpu_vendor / normalize_architecture_name are imported locally inside # init_build_context, so patch them at their source modules. 
- with patch("madengine.core.context.get_gpu_tool_manager", return_value=manager), \ - patch("madengine.utils.gpu_validator.detect_gpu_vendor", return_value=GPUVendor.AMD), \ - patch("madengine.execution.dockerfile_utils.normalize_architecture_name", return_value="gfx942"), \ - patch.object(Context, "get_ctx_test", return_value="test"), \ - patch.object(Context, "get_host_os", return_value="linux"): + with patch( + "madengine.core.context.get_gpu_tool_manager", return_value=manager + ), patch( + "madengine.utils.gpu_validator.detect_gpu_vendor", + return_value=GPUVendor.AMD, + ), patch( + "madengine.execution.dockerfile_utils.normalize_architecture_name", + return_value="gfx942", + ), patch.object( + Context, "get_ctx_test", return_value="test" + ), patch.object( + Context, "get_host_os", return_value="linux" + ): ctx.init_build_context(detect_gpu_arch=True) assert ctx.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] == "gfx942" @@ -140,11 +151,19 @@ def test_auto_detect_does_not_override_user_value(self): manager = MagicMock() manager.get_gpu_architecture.return_value = "gfx942" - with patch("madengine.core.context.get_gpu_tool_manager", return_value=manager), \ - patch("madengine.utils.gpu_validator.detect_gpu_vendor", return_value=GPUVendor.AMD), \ - patch("madengine.execution.dockerfile_utils.normalize_architecture_name", return_value="gfx942"), \ - patch.object(Context, "get_ctx_test", return_value="test"), \ - patch.object(Context, "get_host_os", return_value="linux"): + with patch( + "madengine.core.context.get_gpu_tool_manager", return_value=manager + ), patch( + "madengine.utils.gpu_validator.detect_gpu_vendor", + return_value=GPUVendor.AMD, + ), patch( + "madengine.execution.dockerfile_utils.normalize_architecture_name", + return_value="gfx942", + ), patch.object( + Context, "get_ctx_test", return_value="test" + ), patch.object( + Context, "get_host_os", return_value="linux" + ): ctx.init_build_context(detect_gpu_arch=True) # User value must be 
preserved; auto-detect must not overwrite it. @@ -154,10 +173,14 @@ def test_auto_detect_warns_on_no_gpu(self): """Should warn (not crash) when no supported GPU is detected.""" ctx = _make_build_only_ctx() - with patch("madengine.utils.gpu_validator.detect_gpu_vendor", return_value=GPUVendor.UNKNOWN), \ - patch.object(Context, "get_ctx_test", return_value="test"), \ - patch.object(Context, "get_host_os", return_value="linux"), \ - patch("builtins.print") as mock_print: + with patch( + "madengine.utils.gpu_validator.detect_gpu_vendor", + return_value=GPUVendor.UNKNOWN, + ), patch.object(Context, "get_ctx_test", return_value="test"), patch.object( + Context, "get_host_os", return_value="linux" + ), patch( + "builtins.print" + ) as mock_print: ctx.init_build_context(detect_gpu_arch=True) msgs = [str(c.args[0]) for c in mock_print.call_args_list if c.args] @@ -168,10 +191,14 @@ def test_auto_detect_handles_exception_gracefully(self): """Detection failure should warn, not raise.""" ctx = _make_build_only_ctx() - with patch("madengine.utils.gpu_validator.detect_gpu_vendor", side_effect=RuntimeError("rocminfo not found")), \ - patch.object(Context, "get_ctx_test", return_value="test"), \ - patch.object(Context, "get_host_os", return_value="linux"), \ - patch("builtins.print") as mock_print: + with patch( + "madengine.utils.gpu_validator.detect_gpu_vendor", + side_effect=RuntimeError("rocminfo not found"), + ), patch.object(Context, "get_ctx_test", return_value="test"), patch.object( + Context, "get_host_os", return_value="linux" + ), patch( + "builtins.print" + ) as mock_print: ctx.init_build_context(detect_gpu_arch=True) msgs = [str(c.args[0]) for c in mock_print.call_args_list if c.args] @@ -182,9 +209,13 @@ def test_no_detection_when_flag_is_false(self): """detect_gpu_arch=False should skip detection entirely.""" ctx = _make_build_only_ctx() - with patch("madengine.utils.gpu_validator.detect_gpu_vendor") as mock_detect, \ - patch.object(Context, "get_ctx_test", 
return_value="test"), \ - patch.object(Context, "get_host_os", return_value="linux"): + with patch( + "madengine.utils.gpu_validator.detect_gpu_vendor" + ) as mock_detect, patch.object( + Context, "get_ctx_test", return_value="test" + ), patch.object( + Context, "get_host_os", return_value="linux" + ): ctx.init_build_context(detect_gpu_arch=False) mock_detect.assert_not_called() diff --git a/tests/unit/test_database_mongodb.py b/tests/unit/test_database_mongodb.py index b9f0aa65..d53a70a9 100644 --- a/tests/unit/test_database_mongodb.py +++ b/tests/unit/test_database_mongodb.py @@ -12,26 +12,27 @@ import tempfile from pathlib import Path from unittest.mock import MagicMock, patch + import pytest from madengine.database.mongodb import ( + CSVLoader, + DocumentTransformer, + FileFormat, + JSONLoader, MongoDBConfig, UploadOptions, UploadResult, - FileFormat, - JSONLoader, - CSVLoader, - DocumentTransformer, detect_file_format, get_loader, upload_file_to_mongodb, ) - # ============================================================================ # Fixtures # ============================================================================ + @pytest.fixture def sample_json_data(): """Sample JSON data with native types.""" @@ -41,39 +42,31 @@ def sample_json_data(): "performance": 123.45, "metric": "tokens/sec", "status": "SUCCESS", - "configs": { - "batch_size": 32, - "learning_rate": 0.001 - }, + "configs": {"batch_size": 32, "learning_rate": 0.001}, "enabled": True, - "timestamp": "2026-01-07 10:00:00" + "timestamp": "2026-01-07 10:00:00", }, { "model": "test_model_2", "performance": 234.56, "metric": "tokens/sec", "status": "SUCCESS", - "configs": { - "batch_size": 64, - "learning_rate": 0.002 - }, + "configs": {"batch_size": 64, "learning_rate": 0.002}, "enabled": False, - "timestamp": "2026-01-07 10:05:00" - } + "timestamp": "2026-01-07 10:05:00", + }, ] @pytest.fixture def temp_json_file(sample_json_data): """Create a temporary JSON file.""" - with 
tempfile.NamedTemporaryFile( - mode='w', suffix='.json', delete=False - ) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: json.dump(sample_json_data, f) file_path = f.name - + yield Path(file_path) - + # Cleanup if os.path.exists(file_path): os.unlink(file_path) @@ -83,15 +76,15 @@ def temp_json_file(sample_json_data): def temp_csv_file(): """Create a temporary CSV file.""" with tempfile.NamedTemporaryFile( - mode='w', suffix='.csv', delete=False, newline='' + mode="w", suffix=".csv", delete=False, newline="" ) as f: f.write("model,performance,metric,status,timestamp\n") f.write("csv_model_1,345.67,tokens/sec,SUCCESS,2026-01-07 11:00:00\n") f.write("csv_model_2,456.78,tokens/sec,SUCCESS,2026-01-07 11:05:00\n") file_path = f.name - + yield Path(file_path) - + # Cleanup if os.path.exists(file_path): os.unlink(file_path) @@ -101,11 +94,12 @@ def temp_csv_file(): # Configuration Tests # ============================================================================ + @pytest.mark.unit def test_mongodb_config_defaults(): """Test MongoDBConfig with default values.""" config = MongoDBConfig() - + assert config.host == "localhost" assert config.port == 27017 assert config.username == "" @@ -122,10 +116,10 @@ def test_mongodb_config_from_env(): "MONGO_USER": "testuser", "MONGO_PASSWORD": "testpass", } - + with patch.dict(os.environ, env_vars, clear=False): config = MongoDBConfig.from_env() - + assert config.host == "test-host" assert config.port == 27018 assert config.username == "testuser" @@ -136,12 +130,9 @@ def test_mongodb_config_from_env(): def test_mongodb_config_uri_with_auth(): """Test MongoDB URI generation with authentication.""" config = MongoDBConfig( - host="example.com", - port=27017, - username="user", - password="pass" + host="example.com", port=27017, username="user", password="pass" ) - + assert config.uri == "mongodb://user:pass@example.com:27017/admin" @@ -149,7 +140,7 @@ def test_mongodb_config_uri_with_auth(): def 
test_mongodb_config_uri_without_auth(): """Test MongoDB URI generation without authentication.""" config = MongoDBConfig(host="example.com", port=27017) - + assert config.uri == "mongodb://example.com:27017" @@ -157,7 +148,7 @@ def test_mongodb_config_uri_without_auth(): def test_upload_options_defaults(): """Test UploadOptions default values.""" options = UploadOptions() - + assert options.unique_fields is None assert options.upsert is True assert options.batch_size == 1000 @@ -171,6 +162,7 @@ def test_upload_options_defaults(): # File Detection Tests # ============================================================================ + @pytest.mark.unit def test_detect_json_format_by_extension(temp_json_file): """Test JSON format detection by file extension.""" @@ -188,12 +180,10 @@ def test_detect_csv_format_by_extension(temp_csv_file): @pytest.mark.unit def test_detect_json_format_by_content(): """Test JSON format detection by content when no extension.""" - with tempfile.NamedTemporaryFile( - mode='w', suffix='', delete=False - ) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix="", delete=False) as f: json.dump({"test": "data"}, f) file_path = f.name - + try: file_format = detect_file_format(Path(file_path)) assert file_format == FileFormat.JSON @@ -219,12 +209,13 @@ def test_get_loader_csv(): # JSON Loader Tests # ============================================================================ + @pytest.mark.unit def test_json_loader_load_array(temp_json_file, sample_json_data): """Test JSONLoader with array format.""" loader = JSONLoader() documents = loader.load(temp_json_file) - + assert len(documents) == 2 assert documents[0]["model"] == "test_model_1" assert documents[0]["performance"] == 123.45 @@ -236,17 +227,15 @@ def test_json_loader_load_array(temp_json_file, sample_json_data): def test_json_loader_load_single_object(): """Test JSONLoader with single object format.""" data = {"model": "test", "value": 42} - - with tempfile.NamedTemporaryFile( - 
mode='w', suffix='.json', delete=False - ) as f: + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: json.dump(data, f) file_path = f.name - + try: loader = JSONLoader() documents = loader.load(Path(file_path)) - + assert len(documents) == 1 assert documents[0]["model"] == "test" assert documents[0]["value"] == 42 @@ -259,7 +248,7 @@ def test_json_loader_preserves_types(temp_json_file): """Test that JSONLoader preserves native types.""" loader = JSONLoader() documents = loader.load(temp_json_file) - + doc = documents[0] assert isinstance(doc["performance"], float) assert isinstance(doc["configs"], dict) @@ -272,7 +261,7 @@ def test_json_loader_infer_schema(sample_json_data): """Test JSON schema inference.""" loader = JSONLoader() schema = loader.infer_schema(sample_json_data) - + assert schema["model"] == str assert schema["performance"] == float assert schema["configs"] == dict @@ -283,12 +272,13 @@ def test_json_loader_infer_schema(sample_json_data): # CSV Loader Tests # ============================================================================ + @pytest.mark.unit def test_csv_loader_load(temp_csv_file): """Test CSVLoader basic loading.""" loader = CSVLoader() documents = loader.load(temp_csv_file) - + assert len(documents) == 2 assert documents[0]["model"] == "csv_model_1" assert documents[1]["model"] == "csv_model_2" @@ -299,7 +289,7 @@ def test_csv_loader_type_inference(temp_csv_file): """Test that CSVLoader infers types correctly.""" loader = CSVLoader() documents = loader.load(temp_csv_file) - + doc = documents[0] # Performance should be float, not string assert isinstance(doc["performance"], (float, int)) @@ -310,16 +300,16 @@ def test_csv_loader_type_inference(temp_csv_file): def test_csv_loader_json_string_parsing(): """Test that CSVLoader parses JSON strings in columns.""" with tempfile.NamedTemporaryFile( - mode='w', suffix='.csv', delete=False, newline='' + mode="w", suffix=".csv", delete=False, newline="" ) as f: - 
f.write('model,configs\n') + f.write("model,configs\n") f.write('test,"{""lr"": 0.001}"\n') file_path = f.name - + try: loader = CSVLoader() documents = loader.load(Path(file_path)) - + # Should parse JSON string in configs column assert isinstance(documents[0]["configs"], (dict, str)) finally: @@ -330,17 +320,17 @@ def test_csv_loader_json_string_parsing(): def test_csv_loader_handles_null_values(): """Test CSVLoader handles null/missing values.""" with tempfile.NamedTemporaryFile( - mode='w', suffix='.csv', delete=False, newline='' + mode="w", suffix=".csv", delete=False, newline="" ) as f: - f.write('model,value\n') - f.write('test1,42\n') - f.write('test2,\n') # Empty value + f.write("model,value\n") + f.write("test1,42\n") + f.write("test2,\n") # Empty value file_path = f.name - + try: loader = CSVLoader() documents = loader.load(Path(file_path)) - + assert documents[0]["value"] == 42 assert documents[1]["value"] is None finally: @@ -351,15 +341,16 @@ def test_csv_loader_handles_null_values(): # Document Transformer Tests # ============================================================================ + @pytest.mark.unit def test_document_transformer_adds_metadata(): """Test that transformer adds metadata fields.""" options = UploadOptions(add_metadata=True) transformer = DocumentTransformer(options) - + documents = [{"model": "test", "value": 42}] transformed = transformer.transform(documents) - + assert "_meta_uploaded_at" in transformed[0] assert "created_date" in transformed[0] @@ -369,11 +360,11 @@ def test_document_transformer_preserves_existing_metadata(): """Test that transformer preserves existing created_date.""" options = UploadOptions(add_metadata=True) transformer = DocumentTransformer(options) - + original_date = "2026-01-01 00:00:00" documents = [{"model": "test", "created_date": original_date}] transformed = transformer.transform(documents) - + assert transformed[0]["created_date"] == original_date @@ -382,14 +373,14 @@ def 
test_document_transformer_infer_unique_fields(): """Test automatic unique field inference.""" options = UploadOptions() transformer = DocumentTransformer(options) - + documents = [ {"model": "model1", "timestamp": "2026-01-01", "value": 1}, {"model": "model2", "timestamp": "2026-01-02", "value": 2}, ] - + unique_fields = transformer.infer_unique_fields(documents) - + assert "model" in unique_fields @@ -398,10 +389,10 @@ def test_document_transformer_no_metadata_when_disabled(): """Test that metadata is not added when disabled.""" options = UploadOptions(add_metadata=False) transformer = DocumentTransformer(options) - + documents = [{"model": "test", "value": 42}] transformed = transformer.transform(documents) - + assert "_meta_uploaded_at" not in transformed[0] @@ -409,6 +400,7 @@ def test_document_transformer_no_metadata_when_disabled(): # Upload Result Tests # ============================================================================ + @pytest.mark.unit def test_upload_result_success_status(): """Test UploadResult with success status.""" @@ -419,9 +411,9 @@ def test_upload_result_success_status(): documents_inserted=8, documents_updated=2, documents_failed=0, - duration_seconds=1.5 + duration_seconds=1.5, ) - + assert result.status == "success" assert result.documents_read == 10 assert result.documents_inserted == 8 @@ -439,9 +431,9 @@ def test_upload_result_with_errors(): documents_updated=1, documents_failed=2, errors=["Error 1", "Error 2"], - duration_seconds=2.0 + duration_seconds=2.0, ) - + assert result.status == "partial" assert result.documents_failed == 2 assert len(result.errors) == 2 @@ -451,20 +443,21 @@ def test_upload_result_with_errors(): # Main Upload Function Tests (Mocked) # ============================================================================ + @pytest.mark.unit def test_upload_file_to_mongodb_json_dry_run(temp_json_file): """Test uploading JSON file in dry-run mode.""" config = MongoDBConfig() options = UploadOptions(dry_run=True) - + 
result = upload_file_to_mongodb( file_path=str(temp_json_file), database_name="test_db", collection_name="test_collection", config=config, - options=options + options=options, ) - + assert result.status == "success" assert result.documents_read == 2 assert result.documents_processed == 0 @@ -476,15 +469,15 @@ def test_upload_file_to_mongodb_csv_dry_run(temp_csv_file): """Test uploading CSV file in dry-run mode.""" config = MongoDBConfig() options = UploadOptions(dry_run=True) - + result = upload_file_to_mongodb( file_path=str(temp_csv_file), database_name="test_db", collection_name="test_collection", config=config, - options=options + options=options, ) - + assert result.status == "success" assert result.documents_read == 2 @@ -493,19 +486,16 @@ def test_upload_file_to_mongodb_csv_dry_run(temp_csv_file): def test_upload_file_to_mongodb_auto_detects_unique_fields(temp_json_file): """Test that upload auto-detects unique fields.""" config = MongoDBConfig() - options = UploadOptions( - dry_run=True, - unique_fields=None # Should auto-detect - ) - + options = UploadOptions(dry_run=True, unique_fields=None) # Should auto-detect + result = upload_file_to_mongodb( file_path=str(temp_json_file), database_name="test_db", collection_name="test_collection", config=config, - options=options + options=options, ) - + assert result.status == "success" # Options should have been updated with detected fields assert options.unique_fields is not None @@ -516,14 +506,14 @@ def test_upload_file_to_mongodb_file_not_found(): """Test upload with non-existent file.""" config = MongoDBConfig() options = UploadOptions() - + with pytest.raises(FileNotFoundError): upload_file_to_mongodb( file_path="/nonexistent/file.json", database_name="test_db", collection_name="test_collection", config=config, - options=options + options=options, ) @@ -531,25 +521,22 @@ def test_upload_file_to_mongodb_file_not_found(): def test_upload_file_to_mongodb_with_custom_unique_fields(temp_json_file): """Test upload 
with custom unique fields.""" config = MongoDBConfig() - options = UploadOptions( - dry_run=True, - unique_fields=["model", "timestamp"] - ) - + options = UploadOptions(dry_run=True, unique_fields=["model", "timestamp"]) + result = upload_file_to_mongodb( file_path=str(temp_json_file), database_name="test_db", collection_name="test_collection", config=config, - options=options + options=options, ) - + assert result.status == "success" assert options.unique_fields == ["model", "timestamp"] @pytest.mark.unit -@patch('madengine.database.mongodb.MongoDBUploader') +@patch("madengine.database.mongodb.MongoDBUploader") def test_upload_file_to_mongodb_calls_uploader(mock_uploader_class, temp_json_file): """Test that upload function properly calls MongoDBUploader.""" # Setup mock @@ -562,20 +549,20 @@ def test_upload_file_to_mongodb_calls_uploader(mock_uploader_class, temp_json_fi documents_inserted=2, documents_updated=0, documents_failed=0, - duration_seconds=0.1 + duration_seconds=0.1, ) - + config = MongoDBConfig() options = UploadOptions(dry_run=False) - + result = upload_file_to_mongodb( file_path=str(temp_json_file), database_name="test_db", collection_name="test_collection", config=config, - options=options + options=options, ) - + # Verify uploader was called mock_uploader.upload.assert_called_once() assert result.status == "success" diff --git a/tests/unit/test_deployment.py b/tests/unit/test_deployment.py index 4d5ad4dd..b7530779 100644 --- a/tests/unit/test_deployment.py +++ b/tests/unit/test_deployment.py @@ -15,9 +15,9 @@ tools_include_rocprof_family, ) - # ---- deployment.base (create_jinja_env) ---- + class TestCreateJinjaEnv: """Test create_jinja_env helper.""" @@ -25,7 +25,9 @@ def test_returns_environment_with_dirname_basename_filters(self): """create_jinja_env returns Environment with dirname and basename filters.""" with tempfile.TemporaryDirectory() as tmpdir: p = Path(tmpdir) - (p / "test.j2").write_text("dir={{ path | dirname }} name={{ path | 
basename }}") + (p / "test.j2").write_text( + "dir={{ path | dirname }} name={{ path | basename }}" + ) env = create_jinja_env(p) template = env.get_template("test.j2") out = template.render(path="/foo/bar/baz.txt") @@ -43,6 +45,7 @@ def test_template_dir_must_exist(self): # ---- deployment.common ---- + class TestValidLaunchers: """VALID_LAUNCHERS constant.""" @@ -79,7 +82,9 @@ def test_detects_rocprof_and_presets(self): def test_false_for_rocm_trace_lite(self): assert tools_include_rocprof_family([{"name": "rocm_trace_lite"}]) is False - assert tools_include_rocprof_family([{"name": "rocm_trace_lite_default"}]) is False + assert ( + tools_include_rocprof_family([{"name": "rocm_trace_lite_default"}]) is False + ) class TestIsRocprofv3Available: @@ -107,6 +112,7 @@ def test_returns_false_when_not_found(self): def test_returns_false_on_timeout(self): import subprocess + with patch("madengine.deployment.common.subprocess.run") as m: m.side_effect = subprocess.TimeoutExpired("rocprofv3", 5) assert is_rocprofv3_available() is False @@ -191,17 +197,29 @@ def test_multi_node_other_tools_unchanged(self, _mock_avail): # ---- BaseDeployment._parse_performance_from_log ---- + class _ConcreteDeployment(BaseDeployment): """Minimal concrete subclass to exercise BaseDeployment methods under test.""" DEPLOYMENT_TYPE = "test" - def validate(self): pass - def prepare(self): pass - def deploy(self): pass - def monitor(self, deployment_id): pass - def collect_results(self, deployment_id): pass - def cleanup(self, deployment_id): pass + def validate(self): + pass + + def prepare(self): + pass + + def deploy(self): + pass + + def monitor(self, deployment_id): + pass + + def collect_results(self, deployment_id): + pass + + def cleanup(self, deployment_id): + pass def _make_deployment(): diff --git a/tests/unit/test_discover_models.py b/tests/unit/test_discover_models.py index a8bbe43c..86084be8 100644 --- a/tests/unit/test_discover_models.py +++ b/tests/unit/test_discover_models.py @@ 
-57,7 +57,9 @@ def test_unscoped_inference_still_matches_all_repos(self): def test_colon_in_tag_not_treated_as_scoped(self): """model:arg keeps legacy behavior (no scope/tag split on /).""" - dm = DiscoverModels(args=argparse.Namespace(tags=["MAD-private/foo:batch-size=32"])) + dm = DiscoverModels( + args=argparse.Namespace(tags=["MAD-private/foo:batch-size=32"]) + ) dm.models = [ {"name": "MAD-private/foo", "tags": [], "args": ""}, ] @@ -113,7 +115,8 @@ def test_unscoped_tag_does_not_cross_scope_boundary(self): def test_unscoped_tag_matches_scoped_models_by_tag_field(self): """--tags inference matches any model carrying that tag, regardless of scope prefix. - Tag-list matching is always scope-agnostic; only name-based matching is scope-strict.""" + Tag-list matching is always scope-agnostic; only name-based matching is scope-strict. + """ dm = DiscoverModels(args=argparse.Namespace(tags=["inference"])) dm.models = [ {"name": "MAD/pyt_foo", "tags": ["inference"], "args": ""}, @@ -121,7 +124,10 @@ def test_unscoped_tag_matches_scoped_models_by_tag_field(self): ] dm.custom_models = [] dm.select_models() - assert sorted(m["name"] for m in dm.selected_models) == ["MAD/pyt_bar", "MAD/pyt_foo"] + assert sorted(m["name"] for m in dm.selected_models) == [ + "MAD/pyt_bar", + "MAD/pyt_foo", + ] def test_unscoped_all_selects_every_model(self): """--tags all selects every model regardless of scope.""" @@ -132,7 +138,10 @@ def test_unscoped_all_selects_every_model(self): ] dm.custom_models = [] dm.select_models() - assert sorted(m["name"] for m in dm.selected_models) == ["MAD/pyt_bar", "pyt_foo"] + assert sorted(m["name"] for m in dm.selected_models) == [ + "MAD/pyt_bar", + "pyt_foo", + ] def test_unscoped_tag_matches_root_and_scoped_by_tag_field(self): """--tags inference selects root AND scoped models that carry that tag.""" @@ -143,7 +152,10 @@ def test_unscoped_tag_matches_root_and_scoped_by_tag_field(self): ] dm.custom_models = [] dm.select_models() - assert 
sorted(m["name"] for m in dm.selected_models) == ["MAD/pyt_foo", "root_model"] + assert sorted(m["name"] for m in dm.selected_models) == [ + "MAD/pyt_foo", + "root_model", + ] def test_unscoped_tag_with_extra_args_matches_by_tag_field(self): """--tags inference:batch-size=32 selects by tag 'inference', not 'inference:batch-size=32'.""" diff --git a/tests/unit/test_docker_builder.py b/tests/unit/test_docker_builder.py index 3fe97f9b..e08003ac 100644 --- a/tests/unit/test_docker_builder.py +++ b/tests/unit/test_docker_builder.py @@ -35,7 +35,9 @@ def test_create_registry_image_name_uses_dockerhub_repository(docker_builder): assert out == "myorg/ci:ci-dummy_dummy.ubuntu.amd" -def test_create_registry_image_name_without_credentials_matches_local_tag(docker_builder): +def test_create_registry_image_name_without_credentials_matches_local_tag( + docker_builder, +): out = docker_builder._create_registry_image_name( "ci-dummy_dummy.ubuntu.amd", "dockerhub", diff --git a/tests/unit/test_error_handling.py b/tests/unit/test_error_handling.py index dc210a0b..c1240c08 100644 --- a/tests/unit/test_error_handling.py +++ b/tests/unit/test_error_handling.py @@ -30,7 +30,7 @@ set_error_handler, get_error_handler, handle_error, - create_error_context + create_error_context, ) @@ -45,7 +45,7 @@ def test_error_context_creation(self): phase="execution", component="TestComponent", model_name="test_model", - additional_info=additional_info + additional_info=additional_info, ) assert context.operation == "test_operation" @@ -57,7 +57,7 @@ def test_error_context_creation(self): class TestMADEngineErrorHierarchy: """Test madengine error class hierarchy.""" - + def test_base_madengine_error(self): """Test base madengine error functionality.""" context = ErrorContext(operation="test") @@ -66,9 +66,9 @@ def test_base_madengine_error(self): category=ErrorCategory.RUNTIME, context=context, recoverable=True, - suggestions=["Try again", "Check logs"] + suggestions=["Try again", "Check logs"], ) - + 
assert str(error) == "Test error" assert error.message == "Test error" assert error.category == ErrorCategory.RUNTIME @@ -76,24 +76,27 @@ def test_base_madengine_error(self): assert error.recoverable is True assert error.suggestions == ["Try again", "Check logs"] assert error.cause is None - - @pytest.mark.parametrize("error_class,category,recoverable,message", [ - (ValidationError, ErrorCategory.VALIDATION, True, "Invalid input"), - (NetworkError, ErrorCategory.CONNECTION, True, "Connection failed"), - (BuildError, ErrorCategory.BUILD, False, "Build failed"), - (RunnerError, ErrorCategory.RUNNER, True, "Runner execution failed"), - (AuthenticationError, ErrorCategory.AUTHENTICATION, True, "Auth failed"), - (ConfigurationError, ErrorCategory.CONFIGURATION, True, "Config error"), - ]) + + @pytest.mark.parametrize( + "error_class,category,recoverable,message", + [ + (ValidationError, ErrorCategory.VALIDATION, True, "Invalid input"), + (NetworkError, ErrorCategory.CONNECTION, True, "Connection failed"), + (BuildError, ErrorCategory.BUILD, False, "Build failed"), + (RunnerError, ErrorCategory.RUNNER, True, "Runner execution failed"), + (AuthenticationError, ErrorCategory.AUTHENTICATION, True, "Auth failed"), + (ConfigurationError, ErrorCategory.CONFIGURATION, True, "Config error"), + ], + ) def test_error_types(self, error_class, category, recoverable, message): """Test all error types with parametrized test.""" error = error_class(message) - + assert isinstance(error, MADEngineError) assert error.category == category assert error.recoverable is recoverable assert str(error) == message - + def test_error_with_cause(self): """Test error with underlying cause.""" original_error = ValueError("Original error") @@ -120,13 +123,10 @@ def setup_method(self): def test_handle_madengine_error(self): """Test handling of madengine structured errors.""" context = create_error_context( - operation="test_operation", - component="TestComponent" + operation="test_operation", 
component="TestComponent" ) error = ValidationError( - "Test validation error", - context=context, - suggestions=["Check input"] + "Test validation error", context=context, suggestions=["Check input"] ) self.error_handler.handle_error(error) @@ -160,8 +160,6 @@ def test_set_and_get_error_handler(self): assert retrieved_handler == handler - - class TestErrorRecoveryAndSuggestions: """Test error recovery indicators and suggestions.""" @@ -176,15 +174,13 @@ def test_non_recoverable_errors(self): assert BuildError("Build error").recoverable is False - - class TestErrorPatternMatching: """Test error pattern matching for log analysis. - + These tests validate the error pattern fixes for GPT2 training, ensuring ROCProf logs are correctly excluded while real errors are caught. """ - + @pytest.fixture def benign_patterns(self): """Benign patterns that should be excluded from error detection.""" @@ -199,7 +195,7 @@ def benign_patterns(self): "rocpd_op:", "rpd_tracer:", ] - + @pytest.fixture def error_patterns(self): """Error patterns that should be detected in logs.""" @@ -218,7 +214,7 @@ def error_patterns(self): "ImportError:", "ModuleNotFoundError:", ] - + def test_benign_patterns_match_rocprof_logs(self, benign_patterns): """Test that benign patterns correctly match ROCProf logs.""" # Test cases that should be excluded (false positives) @@ -230,11 +226,11 @@ def test_benign_patterns_match_rocprof_logs(self, benign_patterns): "rocpd_op: 0", "rpd_tracer: finalized in 50.142105 ms", ] - + for test_line in rocprof_messages: matched = any(re.search(pattern, test_line) for pattern in benign_patterns) assert matched, f"Failed to match ROCProf log: {test_line[:80]}" - + def test_error_patterns_catch_real_errors(self, error_patterns): """Test that error patterns correctly catch real errors.""" # Test cases that should be caught (real errors) @@ -247,11 +243,11 @@ def test_error_patterns_catch_real_errors(self, error_patterns): "AssertionError: Expected shape (2, 3) but got (3, 
2)", "torch.distributed.elastic.multiprocessing.errors.ChildFailedError: FAILED", ] - + for test_line in real_errors: matched = any(re.search(pattern, test_line) for pattern in error_patterns) assert matched, f"Failed to catch error: {test_line[:80]}" - + def test_rocprof_messages_dont_trigger_errors(self, error_patterns): """Test that ROCProf messages don't trigger error patterns.""" # ROCProf messages that should NOT trigger errors @@ -261,11 +257,13 @@ def test_rocprof_messages_dont_trigger_errors(self, error_patterns): "rocpd_op: 0", "rpd_tracer: finalized in 50.142105 ms", ] - + for test_line in rocprof_messages: matched = any(re.search(pattern, test_line) for pattern in error_patterns) - assert not matched, f"False positive: {test_line[:80]} matched error pattern" + assert ( + not matched + ), f"False positive: {test_line[:80]} matched error pattern" if __name__ == "__main__": - pytest.main([__file__, "-v"]) \ No newline at end of file + pytest.main([__file__, "-v"]) diff --git a/tests/unit/test_errors.py b/tests/unit/test_errors.py index 078e2d57..fa2b32fc 100644 --- a/tests/unit/test_errors.py +++ b/tests/unit/test_errors.py @@ -24,6 +24,7 @@ # ---- CLI error integration ---- + class TestCLIErrorIntegration: """CLI error handling setup and display.""" @@ -50,7 +51,9 @@ def test_build_command_error_handling(self): setup_logging(verbose=False) error = Exception("Test build error") - context = create_error_context(operation="build", phase="build", component="CLI") + context = create_error_context( + operation="build", phase="build", component="CLI" + ) handle_error(error, context=context) @patch("madengine.cli.utils.console") @@ -62,7 +65,9 @@ def test_cli_error_display_consistency(self, mock_console): handler = get_error_handler() error = ConfigurationError( "Invalid configuration", - context=create_error_context(operation="cli_command", component="CLI", phase="validation"), + context=create_error_context( + operation="cli_command", component="CLI", 
phase="validation" + ), ) handler.handle_error(error) assert handler.console is not None @@ -70,6 +75,7 @@ def test_cli_error_display_consistency(self, mock_console): # ---- Error workflow ---- + class TestErrorWorkflow: """End-to-end error flow and logging.""" @@ -134,11 +140,14 @@ def test_error_context_serialization(self): ) error = ExecutionError("Model execution failed", context=context) data = json.dumps(error.context.__dict__, default=str) - assert "model_execution" in data and "ContainerRunner" in data and "abc123" in data + assert ( + "model_execution" in data and "ContainerRunner" in data and "abc123" in data + ) # ---- Unified error system ---- + class TestUnifiedErrorSystem: """Unified error handling system.""" @@ -147,7 +156,9 @@ def test_error_system_basic_functionality(self): mock_console = Mock() handler = ErrorHandler(console=mock_console, verbose=False) context = create_error_context( - operation="test_operation", component="TestComponent", model_name="test_model" + operation="test_operation", + component="TestComponent", + model_name="test_model", ) error = ValidationError("Test validation error", context=context) handler.handle_error(error) @@ -218,7 +229,9 @@ def test_global_error_handler_workflow(self): set_error_handler(handler) error = ValidationError( "Global handler test", - context=create_error_context(operation="global_test", component="TestGlobalHandler"), + context=create_error_context( + operation="global_test", component="TestGlobalHandler" + ), ) handle_error(error) mock_console.print.assert_called_once() @@ -241,14 +254,20 @@ def test_error_suggestions_and_recovery(self): def test_nested_error_handling(self): """Nested errors with cause chain are handled.""" - from madengine.core.errors import ExecutionError as MADRuntimeError, OrchestrationError, NetworkError + from madengine.core.errors import ( + ExecutionError as MADRuntimeError, + OrchestrationError, + NetworkError, + ) orig = NetworkError("Network timeout") runtime = 
MADRuntimeError("Operation failed", cause=orig) final = OrchestrationError("Orchestration failed", cause=runtime) assert final.cause == runtime and runtime.cause == orig mock_console = Mock() - ErrorHandler(console=mock_console, verbose=True).handle_error(final, show_traceback=True) + ErrorHandler(console=mock_console, verbose=True).handle_error( + final, show_traceback=True + ) assert mock_console.print.call_count >= 1 def test_error_performance(self): @@ -261,7 +280,9 @@ def test_error_performance(self): for i in range(100): err = ValidationError( f"Test error {i}", - context=create_error_context(operation=f"test_op_{i}", component="PerformanceTest"), + context=create_error_context( + operation=f"test_op_{i}", component="PerformanceTest" + ), ) handler.handle_error(err) assert time.time() - start < 1.0 @@ -270,6 +291,7 @@ def test_error_performance(self): # ---- Performance (lightweight) ---- + class TestErrorHandlingPerformance: """Error handler and context creation performance.""" @@ -290,13 +312,17 @@ def test_error_context_creation_performance(self): start = time.time() for i in range(1000): create_error_context( - operation=f"op_{i}", component=f"C_{i}", phase="test", model_name=f"m_{i}" + operation=f"op_{i}", + component=f"C_{i}", + phase="test", + model_name=f"m_{i}", ) assert time.time() - start < 0.1 # ---- Backward compatibility ---- + class TestErrorSystemBackwardCompatibility: """Backward compatibility of the error system.""" @@ -307,7 +333,9 @@ def test_legacy_exception_handling_still_works(self): except Exception as e: mock_console = Mock() handler = ErrorHandler(console=mock_console) - context = create_error_context(operation="legacy_handling", component="LegacyTest") + context = create_error_context( + operation="legacy_handling", component="LegacyTest" + ) handler.handle_error(e, context=context) mock_console.print.assert_called_once() diff --git a/tests/unit/test_execution.py b/tests/unit/test_execution.py index dc18121e..c17ab291 100644 --- 
a/tests/unit/test_execution.py +++ b/tests/unit/test_execution.py @@ -16,9 +16,9 @@ parse_dockerfile_gpu_variables, ) - # ---- Timeout ---- + class TestTimeout: """Timeout context manager: None/0 must not arm signal.alarm.""" @@ -34,11 +34,13 @@ def test_positive_seconds_raises_on_expiry(self): with pytest.raises(TimeoutError): with Timeout(1): import time + time.sleep(2) # ---- container_runner_helpers ---- + class TestResolveRunTimeout: """resolve_run_timeout behavior.""" @@ -59,8 +61,13 @@ def test_falsy_model_timeout_ignored_uses_cli(self, model_timeout): assert resolve_run_timeout({"timeout": model_timeout}, 7200) == 7200 def test_custom_default_cli(self): - assert resolve_run_timeout({"timeout": 100}, 5000, default_cli_timeout=5000) == 100 - assert resolve_run_timeout({"timeout": 100}, 7200, default_cli_timeout=5000) == 7200 + assert ( + resolve_run_timeout({"timeout": 100}, 5000, default_cli_timeout=5000) == 100 + ) + assert ( + resolve_run_timeout({"timeout": 100}, 7200, default_cli_timeout=5000) + == 7200 + ) def test_no_timeout_sentinel_none_passthrough(self): # --timeout 0 is converted to None by the CLI; resolve_run_timeout must @@ -94,7 +101,9 @@ class TestMakeRunLogFilePath: def test_basic_format(self): out = make_run_log_file_path( - {"name": "org/model"}, "ci-org_model_ubuntu.22.04", "", + {"name": "org/model"}, + "ci-org_model_ubuntu.22.04", + "", ) assert out == "org_model_ubuntu.22.04.live.log" @@ -104,7 +113,9 @@ def test_phase_suffix_appended(self): def test_slashes_in_model_name_replaced(self): out = make_run_log_file_path( - {"name": "foo/bar/baz"}, "ci-foo_bar_baz_ubuntu", "", + {"name": "foo/bar/baz"}, + "ci-foo_bar_baz_ubuntu", + "", ) assert "/" not in out assert out.endswith(".live.log") @@ -116,14 +127,18 @@ def test_image_without_ci_prefix(self): def test_no_model_prefix_in_image(self): out = make_run_log_file_path( - {"name": "other/model"}, "ci-some_ubuntu_22", "", + {"name": "other/model"}, + "ci-some_ubuntu_22", + "", ) assert out 
== "other_model_some_ubuntu_22.live.log" def test_full_registry_ref_matches_short_ci_tag(self): """Run log name must match build log base when image is registry/name:ci-….""" model = {"name": "primus_pretrain/torchtitan_MI300X_qwen3_4B-pretrain"} - short = "ci-primus_pretrain_torchtitan_mi300x_qwen3_4b-pretrain_primus.ubuntu.amd" + short = ( + "ci-primus_pretrain_torchtitan_mi300x_qwen3_4b-pretrain_primus.ubuntu.amd" + ) full = f"rocm/mad-private:{short}" assert make_run_log_file_path(model, short, ".run") == make_run_log_file_path( model, full, ".run" @@ -136,6 +151,7 @@ def test_full_registry_ref_matches_short_ci_tag(self): # ---- dockerfile_utils ---- + class TestGpuArchVariables: def test_contains_expected_vars(self): assert "MAD_SYSTEM_GPU_ARCHITECTURE" in GPU_ARCH_VARIABLES @@ -183,22 +199,32 @@ def test_empty_returns_none(self): class TestIsTargetArchCompatibleWithVariable: def test_mad_system_always_compatible(self): - assert is_target_arch_compatible_with_variable( - "MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx90a"], "gfx908" - ) is True + assert ( + is_target_arch_compatible_with_variable( + "MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx90a"], "gfx908" + ) + is True + ) def test_multi_arch_target_in_list(self): - assert is_target_arch_compatible_with_variable( - "PYTORCH_ROCM_ARCH", ["gfx90a", "gfx908"], "gfx90a" - ) is True - assert is_target_arch_compatible_with_variable( - "GPU_TARGETS", ["gfx90a"], "gfx908" - ) is False + assert ( + is_target_arch_compatible_with_variable( + "PYTORCH_ROCM_ARCH", ["gfx90a", "gfx908"], "gfx90a" + ) + is True + ) + assert ( + is_target_arch_compatible_with_variable("GPU_TARGETS", ["gfx90a"], "gfx908") + is False + ) def test_gfx_compilation_exact_match(self): - assert is_target_arch_compatible_with_variable( - "GFX_COMPILATION_ARCH", ["gfx90a"], "gfx90a" - ) is True + assert ( + is_target_arch_compatible_with_variable( + "GFX_COMPILATION_ARCH", ["gfx90a"], "gfx90a" + ) + is True + ) class TestIsCompilationArchCompatible: diff --git 
a/tests/unit/test_hydra_config_loader.py b/tests/unit/test_hydra_config_loader.py new file mode 100644 index 00000000..6330937b --- /dev/null +++ b/tests/unit/test_hydra_config_loader.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +"""Tests for HydraConfigLoader.""" + +import os +import tempfile + +import pytest +from omegaconf import DictConfig + +from madengine.config.loader import HydraConfigLoader +from madengine.core.errors import ConfigurationError + + +class TestParseArgs: + def test_hydra_overrides_only(self): + user_file, overrides = HydraConfigLoader._parse_args( + ["scheduler=slurm", "distributed.nnodes=4"] + ) + assert user_file is None + assert overrides == ["scheduler=slurm", "distributed.nnodes=4"] + + def test_yaml_file_only(self): + user_file, overrides = HydraConfigLoader._parse_args(["/path/to/config.yaml"]) + assert user_file == "/path/to/config.yaml" + assert overrides == [] + + def test_yaml_file_with_overrides(self): + user_file, overrides = HydraConfigLoader._parse_args( + ["/path/to/config.yaml", "distributed.nnodes=8"] + ) + assert user_file == "/path/to/config.yaml" + assert overrides == ["distributed.nnodes=8"] + + def test_yml_extension_recognized(self): + user_file, overrides = HydraConfigLoader._parse_args(["/path/to/config.yml"]) + assert user_file == "/path/to/config.yml" + + def test_multiple_yaml_files_raises(self): + with pytest.raises(ConfigurationError, match="Only one YAML"): + HydraConfigLoader._parse_args(["/path/a.yaml", "/path/b.yaml"]) + + def test_append_override_not_treated_as_file(self): + user_file, overrides = HydraConfigLoader._parse_args(["+profile=mi300x_8gpu"]) + assert user_file is None + assert overrides == ["+profile=mi300x_8gpu"] + + def test_empty_args(self): + user_file, overrides = HydraConfigLoader._parse_args([]) + assert user_file is None + assert overrides == [] + + +class TestLoad: + def test_defaults_only(self): + cfg = HydraConfigLoader.load([]) + assert isinstance(cfg, DictConfig) + assert 
cfg.gpu_vendor == "AMD" + assert cfg.guest_os == "UBUNTU" + assert cfg.distributed.enabled is False + + def test_scheduler_override(self): + cfg = HydraConfigLoader.load(["scheduler=slurm"]) + assert "slurm" in cfg + assert cfg.slurm.partition == "amd-rccl" + + def test_launcher_override(self): + cfg = HydraConfigLoader.load(["launcher=torchrun"]) + assert cfg.distributed.enabled is True + assert cfg.distributed.launcher == "torchrun" + + def test_inline_value_override(self): + cfg = HydraConfigLoader.load(["launcher=torchrun", "distributed.nnodes=4"]) + assert cfg.distributed.nnodes == 4 + + def test_append_profile(self): + cfg = HydraConfigLoader.load(["+profile=mi300x_8gpu"]) + assert cfg.gpu_type == "mi300x" + assert cfg.distributed.nproc_per_node == 8 + + def test_user_yaml_file(self): + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write("debug: true\nenv_vars:\n MY_VAR: hello\n") + f.flush() + try: + cfg = HydraConfigLoader.load([f.name]) + assert cfg.debug is True + assert cfg.env_vars.MY_VAR == "hello" + finally: + os.unlink(f.name) + + def test_user_yaml_with_overrides(self): + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write("debug: true\n") + f.flush() + try: + cfg = HydraConfigLoader.load([f.name, "scheduler=slurm"]) + assert cfg.debug is True + assert "slurm" in cfg + finally: + os.unlink(f.name) + + def test_hardware_nvidia(self): + cfg = HydraConfigLoader.load(["hardware=nvidia"]) + assert cfg.gpu_vendor == "NVIDIA" + assert cfg.runtime.use_gpu_flag is True diff --git a/tests/unit/test_k8s.py b/tests/unit/test_k8s.py index 3391b70c..883d0381 100644 --- a/tests/unit/test_k8s.py +++ b/tests/unit/test_k8s.py @@ -17,11 +17,11 @@ SECRETS_STRATEGY_EXISTING, SECRETS_STRATEGY_FROM_LOCAL, SECRETS_STRATEGY_OMIT, + build_registry_secret_data, estimate_configmap_payload_bytes, merge_secrets_config, resolve_image_pull_secret_refs, resolve_runtime_secret_name, - 
build_registry_secret_data, ) from madengine.deployment.kubernetes import ( _pod_job_name_label_selector, @@ -102,9 +102,7 @@ def test_resolve_runtime_secret_name_existing(): def test_resolve_runtime_secret_name_omit_optional(): - assert ( - resolve_runtime_secret_name(SECRETS_STRATEGY_OMIT, {}, None) is None - ) + assert resolve_runtime_secret_name(SECRETS_STRATEGY_OMIT, {}, None) is None def test_estimate_skips_credential_when_not_in_configmap(): @@ -123,9 +121,15 @@ def test_estimate_skips_credential_when_not_in_configmap(): def test_pvc_match_exact(): assigned: set = set() - assert match_pvc_subdir_to_k8s_pod("my-pod", ["my-pod", "my-pod-0-abc"], assigned) == "my-pod" + assert ( + match_pvc_subdir_to_k8s_pod("my-pod", ["my-pod", "my-pod-0-abc"], assigned) + == "my-pod" + ) assigned.add("my-pod") - assert match_pvc_subdir_to_k8s_pod("my-pod", ["my-pod", "my-pod-0-abc"], assigned) == "my-pod-0-abc" + assert ( + match_pvc_subdir_to_k8s_pod("my-pod", ["my-pod", "my-pod-0-abc"], assigned) + == "my-pod-0-abc" + ) def test_pvc_match_prefix_indexed_job(): diff --git a/tests/unit/test_orchestration.py b/tests/unit/test_orchestration.py index ece59067..9b89d36e 100644 --- a/tests/unit/test_orchestration.py +++ b/tests/unit/test_orchestration.py @@ -1,25 +1,25 @@ """Unit tests for orchestration: image_filtering and orchestrator init/validation.""" import json +from unittest.mock import MagicMock, patch import pytest -from unittest.mock import MagicMock, patch -from madengine.orchestration.image_filtering import ( - filter_images_by_gpu_compatibility, - filter_images_by_skip_gpu_arch, -) from madengine.core.additional_context_defaults import ( DEFAULT_GPU_VENDOR, DEFAULT_GUEST_OS, ) +from madengine.core.errors import ConfigurationError from madengine.orchestration.build_orchestrator import BuildOrchestrator +from madengine.orchestration.image_filtering import ( + filter_images_by_gpu_compatibility, + filter_images_by_skip_gpu_arch, +) from 
madengine.orchestration.run_orchestrator import RunOrchestrator -from madengine.core.errors import ConfigurationError - # ---- image_filtering ---- + class TestFilterImagesByGpuCompatibility: """filter_images_by_gpu_compatibility behavior.""" @@ -94,9 +94,7 @@ def test_skip_gpu_arch_match_skipped(self): def test_skip_gpu_arch_nvidia_prefix_normalized(self): built = {"m1": {}} models = {"m1": {"skip_gpu_arch": "A100"}} - compat, skipped = filter_images_by_skip_gpu_arch( - built, models, "NVIDIA A100" - ) + compat, skipped = filter_images_by_skip_gpu_arch(built, models, "NVIDIA A100") assert compat == {} assert skipped[0][2] == "NVIDIA A100" @@ -110,6 +108,7 @@ def test_skip_gpu_arch_no_match_included(self): # ---- orchestrator init and validation ---- + @pytest.mark.unit class TestBuildOrchestratorInit: """Test Build Orchestrator initialization.""" @@ -221,7 +220,9 @@ def test_skip_after_build_skips_execute_local(self, mock_cleanup, tmp_path): orchestrator = RunOrchestrator(mock_args) - with patch.object(RunOrchestrator, "_build_phase", return_value=str(manifest_path)): + with patch.object( + RunOrchestrator, "_build_phase", return_value=str(manifest_path) + ): with patch.object( RunOrchestrator, "_load_and_merge_manifest", side_effect=lambda f: f ): @@ -267,7 +268,9 @@ def test_skip_ignored_when_run_only_still_calls_execute_local( "successful_runs": [], "failed_runs": [], } - orchestrator.execute(manifest_file=str(manifest_path), tags=None, timeout=60) + orchestrator.execute( + manifest_file=str(manifest_path), tags=None, timeout=60 + ) mock_local.assert_called_once() mock_cleanup.assert_called() diff --git a/tests/unit/test_primus.py b/tests/unit/test_primus.py index 36e187b0..63790cd0 100644 --- a/tests/unit/test_primus.py +++ b/tests/unit/test_primus.py @@ -19,14 +19,15 @@ merged_primus_config, ) - # --- K8s mixin: _generate_primus_command ------------------------------------- class _PrimusCommandHarness(KubernetesLauncherMixin): """Minimal object with 
attributes _generate_primus_command expects.""" - def __init__(self, additional_context, job_name="madengine-test", namespace="default"): + def __init__( + self, additional_context, job_name="madengine-test", namespace="default" + ): self.config = SimpleNamespace(additional_context=additional_context) self.job_name = job_name self.namespace = namespace @@ -72,7 +73,9 @@ def test_single_node_with_backend_override(self): } } ) - cmd = h._generate_primus_command(1, 4, 1234, "scripts/primus_pretrain/run.sh", "") + cmd = h._generate_primus_command( + 1, 4, 1234, "scripts/primus_pretrain/run.sh", "" + ) assert 'export BACKEND="MaxText"' in cmd assert "PRIMUS_CONFIG_PATH=" in cmd @@ -80,7 +83,9 @@ def test_multi_node_service_dns(self): h = _PrimusCommandHarness( {"distributed": {"primus": {}}}, job_name="madengine-j", namespace="ns1" ) - cmd = h._generate_primus_command(2, 8, 1234, "scripts/primus_pretrain/run.sh", "") + cmd = h._generate_primus_command( + 2, 8, 1234, "scripts/primus_pretrain/run.sh", "" + ) assert "madengine-j-0.madengine-j.ns1.svc.cluster.local" in cmd assert "JOB_COMPLETION_INDEX" in cmd assert "NNODES=2" in cmd @@ -94,7 +99,9 @@ def test_multi_node_master_dns_uses_short_headless_subdomain(self): {"distributed": {"primus": {}}}, job_name=long_job, namespace="ns1" ) h.service_name = sub - cmd = h._generate_primus_command(2, 8, 1234, "scripts/primus_pretrain/run.sh", "") + cmd = h._generate_primus_command( + 2, 8, 1234, "scripts/primus_pretrain/run.sh", "" + ) assert f"{long_job}-0.{sub}.ns1.svc.cluster.local" in cmd diff --git a/tests/unit/test_reporting.py b/tests/unit/test_reporting.py index 844137e9..799e887f 100644 --- a/tests/unit/test_reporting.py +++ b/tests/unit/test_reporting.py @@ -1,15 +1,12 @@ """Unit tests for reporting: update_perf_csv and PERF_CSV_HEADER.""" +import json import os import tempfile -import json import pandas as pd -from madengine.reporting.update_perf_csv import ( - PERF_CSV_HEADER, - update_perf_csv, -) +from 
madengine.reporting.update_perf_csv import PERF_CSV_HEADER, update_perf_csv class TestPerfCsvHeader: diff --git a/tests/unit/test_reporting_superset.py b/tests/unit/test_reporting_superset.py index 622c9b75..a7bfe808 100644 --- a/tests/unit/test_reporting_superset.py +++ b/tests/unit/test_reporting_superset.py @@ -9,14 +9,17 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ + # built-in modules import os import json import tempfile import shutil + # 3rd party modules import pytest import pandas as pd + # project modules from madengine.utils.config_parser import ConfigParser from madengine.reporting.update_perf_super import ( @@ -36,126 +39,111 @@ def tmp_dir(self): yield temp_dir if os.path.exists(temp_dir): shutil.rmtree(temp_dir) - + @pytest.fixture def fixtures_dir(self): """Get path to dummy fixtures directory.""" return os.path.join( - os.path.dirname(__file__), - '..', - 'fixtures', - 'dummy', - 'scripts', - 'dummy' + os.path.dirname(__file__), "..", "fixtures", "dummy", "scripts", "dummy" ) - + @pytest.fixture def config_file(self, fixtures_dir): """Get path to config file.""" - return os.path.join(fixtures_dir, 'configs', 'default.csv') - + return os.path.join(fixtures_dir, "configs", "default.csv") + def test_config_file_exists(self, config_file): """Test that the dummy config file exists.""" - assert os.path.exists(config_file), \ - f"Config file should exist at {config_file}" - + assert os.path.exists(config_file), f"Config file should exist at {config_file}" + def test_config_parser_loads_csv(self, config_file): """Test that ConfigParser can load the dummy CSV config.""" parser = ConfigParser() configs = parser.load_config_file(config_file) - + assert configs is not None, "Configs should not be None" assert isinstance(configs, list), "Configs should be a list" assert len(configs) == 3, "Should have 3 config rows" - + # Check first config has expected fields first_config = configs[0] - assert 'model' in first_config - assert 
'benchmark' in first_config - assert 'config_value' in first_config - assert 'batch_size' in first_config - assert 'datatype' in first_config - assert 'max_tokens' in first_config - + assert "model" in first_config + assert "benchmark" in first_config + assert "config_value" in first_config + assert "batch_size" in first_config + assert "datatype" in first_config + assert "max_tokens" in first_config + # Verify values - assert first_config['model'] == 'dummy/model-1' - assert first_config['benchmark'] == 'throughput' - assert first_config['datatype'] == 'float16' - assert first_config['batch_size'] == 8 - assert first_config['config_value'] == 128 - assert first_config['max_tokens'] == 1024 - + assert first_config["model"] == "dummy/model-1" + assert first_config["benchmark"] == "throughput" + assert first_config["datatype"] == "float16" + assert first_config["batch_size"] == 8 + assert first_config["config_value"] == 128 + assert first_config["max_tokens"] == 1024 + def test_config_parser_from_args(self, fixtures_dir): """Test parsing config path from args string.""" parser = ConfigParser(scripts_base_dir=fixtures_dir) args_string = "--config configs/default.csv" - + config_path = parser.parse_config_from_args( - args_string, - os.path.join(fixtures_dir, 'run.sh') + args_string, os.path.join(fixtures_dir, "run.sh") ) - + assert config_path is not None, "Config path should be found" - assert os.path.exists(config_path), \ - f"Config file should exist at {config_path}" - + assert os.path.exists(config_path), f"Config file should exist at {config_path}" + def test_config_parser_parse_and_load(self, fixtures_dir): """Test parse_and_load convenience method.""" parser = ConfigParser(scripts_base_dir=fixtures_dir) args_string = "--batch-size 32 --config configs/default.csv" - + configs = parser.parse_and_load(args_string, fixtures_dir) - + assert configs is not None, "Configs should be loaded" assert isinstance(configs, list), "Configs should be a list" assert 
len(configs) == 3, "Should have 3 config rows" - + def test_config_parser_no_config_arg(self, fixtures_dir): """Test handling when no --config argument is present.""" parser = ConfigParser(scripts_base_dir=fixtures_dir) args_string = "--batch-size 32 --epochs 10" - + configs = parser.parse_and_load(args_string, fixtures_dir) - + assert configs is None, "Should return None when no config argument" - + def test_config_parser_match_config_to_result(self, config_file): """Test matching configs to results.""" parser = ConfigParser() configs = parser.load_config_file(config_file) - + # Test matching with model name - result_data = { - 'model': 'dummy/model-1', - 'benchmark': 'throughput' - } - - matched = parser.match_config_to_result(configs, result_data, 'dummy/model-1') - + result_data = {"model": "dummy/model-1", "benchmark": "throughput"} + + matched = parser.match_config_to_result(configs, result_data, "dummy/model-1") + assert matched is not None, "Should match a config" - assert matched['model'] == 'dummy/model-1' - assert matched['benchmark'] == 'throughput' - + assert matched["model"] == "dummy/model-1" + assert matched["benchmark"] == "throughput" + def test_config_parser_json_file(self, tmp_dir): """Test loading JSON config file.""" # Create a JSON config file - json_config = { - "batch_size": 32, - "learning_rate": 0.001, - "epochs": 10 - } - + json_config = {"batch_size": 32, "learning_rate": 0.001, "epochs": 10} + json_path = os.path.join(tmp_dir, "config.json") - with open(json_path, 'w') as f: + with open(json_path, "w") as f: json.dump(json_config, f) parser = ConfigParser() configs = parser.load_config_file(json_path) - + assert configs is not None, "Configs should be loaded" assert isinstance(configs, dict), "JSON config should be a dict" - assert configs['batch_size'] == 32 - assert configs['learning_rate'] == 0.001 + assert configs["batch_size"] == 32 + assert configs["learning_rate"] == 0.001 class TestPerfEntrySuperGeneration: @@ -168,19 +156,14 
@@ def tmp_dir(self): yield temp_dir if os.path.exists(temp_dir): shutil.rmtree(temp_dir) - + @pytest.fixture def fixtures_dir(self): """Get path to dummy fixtures directory.""" return os.path.join( - os.path.dirname(__file__), - '..', - 'fixtures', - 'dummy', - 'scripts', - 'dummy' + os.path.dirname(__file__), "..", "fixtures", "dummy", "scripts", "dummy" ) - + def test_perf_entry_super_json_structure(self, tmp_dir, fixtures_dir): """Test that perf_super.json has the correct structure.""" # Create mock data @@ -214,12 +197,12 @@ def test_perf_entry_super_json_structure(self, tmp_dir, fixtures_dir): # Create common_info.json common_info_path = os.path.join(tmp_dir, "common_info.json") - with open(common_info_path, 'w') as f: + with open(common_info_path, "w") as f: json.dump(common_info, f) - + # Create results CSV results_csv = os.path.join(tmp_dir, "perf_dummy_super.csv") - with open(results_csv, 'w') as f: + with open(results_csv, "w") as f: f.write("model,performance,metric,status\n") f.write("dummy/model-1,1234.56,tokens/s,SUCCESS\n") f.write("dummy/model-2,2345.67,requests/s,SUCCESS\n") @@ -227,54 +210,61 @@ def test_perf_entry_super_json_structure(self, tmp_dir, fixtures_dir): # Generate perf_super.json (cumulative) perf_super_path = os.path.join(tmp_dir, "perf_super.json") - + update_perf_super_json( perf_super_json=perf_super_path, multiple_results=results_csv, common_info=common_info_path, model_name="dummy_perf_super", - scripts_base_dir=fixtures_dir + scripts_base_dir=fixtures_dir, ) - + # Verify file was created - assert os.path.exists(perf_super_path), \ - "perf_super.json should be created" - + assert os.path.exists(perf_super_path), "perf_super.json should be created" + # Load and verify structure - with open(perf_super_path, 'r') as f: + with open(perf_super_path, "r") as f: data = json.load(f) - + assert isinstance(data, list), "Data should be a list" assert len(data) == 3, "Should have 3 result records" - + # Check first record structure 
first_record = data[0] - + # Verify all common fields are present required_fields = [ - 'model', 'performance', 'metric', 'status', 'pipeline', - 'n_gpus', 'args', 'tags', 'gpu_architecture' + "model", + "performance", + "metric", + "status", + "pipeline", + "n_gpus", + "args", + "tags", + "gpu_architecture", ] for field in required_fields: assert field in first_record, f"Field '{field}' should be present" - + # Verify configs field is present - assert 'configs' in first_record, "configs field should be present" - + assert "configs" in first_record, "configs field should be present" + # Verify configs is not None (config file was found and loaded) - assert first_record['configs'] is not None, \ - "configs should not be None when config file exists" - + assert ( + first_record["configs"] is not None + ), "configs should not be None when config file exists" + # Verify configs has expected structure - configs = first_record['configs'] + configs = first_record["configs"] assert isinstance(configs, dict), "configs should be a dict" - assert 'model' in configs - assert 'benchmark' in configs - assert 'config_value' in configs - assert 'batch_size' in configs - assert 'datatype' in configs - assert 'max_tokens' in configs - + assert "model" in configs + assert "benchmark" in configs + assert "config_value" in configs + assert "batch_size" in configs + assert "datatype" in configs + assert "max_tokens" in configs + def test_perf_entry_super_config_matching(self, tmp_dir, fixtures_dir): """Test that configs are correctly matched for all results.""" # Create mock data @@ -305,14 +295,14 @@ def test_perf_entry_super_config_matching(self, tmp_dir, fixtures_dir): "build_number": "", "additional_docker_run_options": "", } - + common_info_path = os.path.join(tmp_dir, "common_info_super.json") - with open(common_info_path, 'w') as f: + with open(common_info_path, "w") as f: json.dump(common_info, f) # Create results CSV results_csv = os.path.join(tmp_dir, "perf_dummy_super.csv") - 
with open(results_csv, 'w') as f: + with open(results_csv, "w") as f: f.write("model,performance,metric,benchmark\n") f.write("dummy/model-1,1234.56,tokens/s,throughput\n") f.write("dummy/model-2,2345.67,requests/s,serving\n") @@ -325,33 +315,33 @@ def test_perf_entry_super_config_matching(self, tmp_dir, fixtures_dir): multiple_results=results_csv, common_info=common_info_path, model_name="dummy_perf_super", - scripts_base_dir=fixtures_dir + scripts_base_dir=fixtures_dir, ) # Load and verify matching - with open(perf_super_path, 'r') as f: + with open(perf_super_path, "r") as f: data = json.load(f) - + # Verify each result has configs assert len(data) == 3, "Should have 3 results" - + for record in data: - configs = record.get('configs') + configs = record.get("configs") assert configs is not None, "Each record should have configs" assert isinstance(configs, dict), "Configs should be a dict" - + # Verify configs have expected structure (from default.csv) - assert 'model' in configs - assert 'benchmark' in configs - assert 'config_value' in configs - assert 'batch_size' in configs - assert 'datatype' in configs - assert 'max_tokens' in configs - + assert "model" in configs + assert "benchmark" in configs + assert "config_value" in configs + assert "batch_size" in configs + assert "datatype" in configs + assert "max_tokens" in configs + # Verify configs values are from our config file - assert configs['benchmark'] in ['throughput', 'serving', 'latency'] - assert configs['datatype'] in ['float16', 'float32', 'bfloat16'] - + assert configs["benchmark"] in ["throughput", "serving", "latency"] + assert configs["datatype"] in ["float16", "float32", "bfloat16"] + def test_perf_entry_super_no_config(self, tmp_dir, fixtures_dir): """Test handling when no config file is specified.""" # Create mock data without config @@ -382,37 +372,38 @@ def test_perf_entry_super_no_config(self, tmp_dir, fixtures_dir): "build_number": "", "additional_docker_run_options": "", } - + 
common_info_path = os.path.join(tmp_dir, "common_info_super.json") - with open(common_info_path, 'w') as f: + with open(common_info_path, "w") as f: json.dump(common_info, f) # Create results CSV results_csv = os.path.join(tmp_dir, "perf_dummy_super.csv") - with open(results_csv, 'w') as f: + with open(results_csv, "w") as f: f.write("model,performance,metric\n") f.write("dummy-no-config,1234.56,tokens/s\n") perf_super_path = os.path.join(tmp_dir, "perf_super.json") - + update_perf_super_json( perf_super_json=perf_super_path, multiple_results=results_csv, common_info=common_info_path, model_name="dummy_no_config", - scripts_base_dir=fixtures_dir + scripts_base_dir=fixtures_dir, ) - + # Load and verify - with open(perf_super_path, 'r') as f: + with open(perf_super_path, "r") as f: data = json.load(f) - + assert len(data) == 1, "Should have 1 result" - + # Verify configs is None when no config file - assert data[0]['configs'] is None, \ - "configs should be None when no config file specified" - + assert ( + data[0]["configs"] is None + ), "configs should be None when no config file specified" + def test_perf_entry_super_multi_results(self, tmp_dir, fixtures_dir): """Test handling of multiple result metrics.""" common_info = { @@ -442,18 +433,20 @@ def test_perf_entry_super_multi_results(self, tmp_dir, fixtures_dir): "build_number": "", "additional_docker_run_options": "", } - + common_info_path = os.path.join(tmp_dir, "common_info_super.json") - with open(common_info_path, 'w') as f: + with open(common_info_path, "w") as f: json.dump(common_info, f) # Create results CSV with extra metrics results_csv = os.path.join(tmp_dir, "perf_multi_metrics.csv") - with open(results_csv, 'w') as f: - f.write("model,performance,metric,throughput,latency_mean_ms,latency_p50_ms,latency_p90_ms,gpu_memory_used_mb\n") + with open(results_csv, "w") as f: + f.write( + "model,performance,metric,throughput,latency_mean_ms,latency_p50_ms,latency_p90_ms,gpu_memory_used_mb\n" + ) 
f.write("model-1,1234.56,tokens/s,1234.56,8.1,7.9,12.3,12288\n") f.write("model-2,2345.67,requests/s,2345.67,4.3,4.1,6.8,16384\n") - + perf_super_path = os.path.join(tmp_dir, "perf_super.json") update_perf_super_json( @@ -461,35 +454,37 @@ def test_perf_entry_super_multi_results(self, tmp_dir, fixtures_dir): multiple_results=results_csv, common_info=common_info_path, model_name="test_multi_metrics", - scripts_base_dir=fixtures_dir + scripts_base_dir=fixtures_dir, ) - + # Load and verify - with open(perf_super_path, 'r') as f: + with open(perf_super_path, "r") as f: data = json.load(f) - + assert len(data) == 2, "Should have 2 results" - + # Check first result has multi_results with extra metrics first_result = data[0] - assert 'multi_results' in first_result, "Should have multi_results field" - assert first_result['multi_results'] is not None, "multi_results should not be None" - - multi_results = first_result['multi_results'] + assert "multi_results" in first_result, "Should have multi_results field" + assert ( + first_result["multi_results"] is not None + ), "multi_results should not be None" + + multi_results = first_result["multi_results"] assert isinstance(multi_results, dict), "multi_results should be a dict" - + # Verify extra metrics are in multi_results - assert 'throughput' in multi_results - assert 'latency_mean_ms' in multi_results - assert 'latency_p50_ms' in multi_results - assert 'latency_p90_ms' in multi_results - assert 'gpu_memory_used_mb' in multi_results - + assert "throughput" in multi_results + assert "latency_mean_ms" in multi_results + assert "latency_p50_ms" in multi_results + assert "latency_p90_ms" in multi_results + assert "gpu_memory_used_mb" in multi_results + # Verify values - assert multi_results['throughput'] == 1234.56 - assert multi_results['latency_mean_ms'] == 8.1 - assert multi_results['gpu_memory_used_mb'] == 12288 - + assert multi_results["throughput"] == 1234.56 + assert multi_results["latency_mean_ms"] == 8.1 + assert 
multi_results["gpu_memory_used_mb"] == 12288 + def test_perf_entry_super_deployment_fields(self, tmp_dir, fixtures_dir): """Test that all deployment-related fields are present.""" common_info = { @@ -519,17 +514,17 @@ def test_perf_entry_super_deployment_fields(self, tmp_dir, fixtures_dir): "build_number": "", "additional_docker_run_options": "", } - + common_info_path = os.path.join(tmp_dir, "common_info_super.json") - with open(common_info_path, 'w') as f: + with open(common_info_path, "w") as f: json.dump(common_info, f) # Create results CSV results_csv = os.path.join(tmp_dir, "perf_deployment.csv") - with open(results_csv, 'w') as f: + with open(results_csv, "w") as f: f.write("model,performance,metric\n") f.write("multi-node-test,5000.0,tokens/s\n") - + perf_super_path = os.path.join(tmp_dir, "perf_super.json") update_perf_super_json( @@ -537,17 +532,17 @@ def test_perf_entry_super_deployment_fields(self, tmp_dir, fixtures_dir): multiple_results=results_csv, common_info=common_info_path, model_name="test_deployment", - scripts_base_dir=fixtures_dir + scripts_base_dir=fixtures_dir, ) - + # Load and verify - with open(perf_super_path, 'r') as f: + with open(perf_super_path, "r") as f: data = json.load(f) - + assert len(data) == 1, "Should have 1 result" - + result = data[0] - + # Verify all deployment fields are present deployment_fields = { "n_gpus": "16", @@ -557,11 +552,12 @@ def test_perf_entry_super_deployment_fields(self, tmp_dir, fixtures_dir): "launcher": "torchrun", "machine_name": "node-1", } - + for field, expected_value in deployment_fields.items(): assert field in result, f"Field '{field}' should be present" - assert result[field] == expected_value, \ - f"Field '{field}' should be '{expected_value}', got '{result[field]}'" + assert ( + result[field] == expected_value + ), f"Field '{field}' should be '{expected_value}', got '{result[field]}'" class TestPerfSuperCSVGeneration: @@ -596,52 +592,51 @@ def test_csv_generation_from_json(self, tmp_dir): 
"status": "SUCCESS", "configs": {"batch_size": 64, "learning_rate": 0.002}, "multi_results": None, - } + }, ] - + json_path = os.path.join(tmp_dir, "perf_super.json") - with open(json_path, 'w') as f: + with open(json_path, "w") as f: json.dump(data, f) # Change to test directory original_dir = os.getcwd() os.chdir(tmp_dir) - + try: # Generate CSVs update_perf_super_csv( - perf_super_json="perf_super.json", - perf_super_csv="perf_super.csv" + perf_super_json="perf_super.json", perf_super_csv="perf_super.csv" ) - + # Verify files exist - assert os.path.exists("perf_entry_super.csv"), \ - "perf_entry_super.csv should be created" - assert os.path.exists("perf_super.csv"), \ - "perf_super.csv should be created" - + assert os.path.exists( + "perf_entry_super.csv" + ), "perf_entry_super.csv should be created" + assert os.path.exists("perf_super.csv"), "perf_super.csv should be created" + # Load and verify perf_entry_super.csv (latest entry only) entry_df = pd.read_csv("perf_entry_super.csv") assert len(entry_df) == 1, "Should have 1 entry (latest)" - assert entry_df.iloc[0]['model'] == "test_model_2" - + assert entry_df.iloc[0]["model"] == "test_model_2" + # Load and verify perf_super.csv (all entries) super_df = pd.read_csv("perf_super.csv") assert len(super_df) == 2, "Should have 2 entries (all)" - + # Verify configs column is JSON string - assert 'configs' in super_df.columns - first_configs = json.loads(super_df.iloc[0]['configs']) - assert first_configs['batch_size'] == 32 - + assert "configs" in super_df.columns + first_configs = json.loads(super_df.iloc[0]["configs"]) + assert first_configs["batch_size"] == 32 + # Verify multi_results column - assert 'multi_results' in super_df.columns - first_multi = json.loads(super_df.iloc[0]['multi_results']) - assert first_multi['throughput'] == 1234.56 - + assert "multi_results" in super_df.columns + first_multi = json.loads(super_df.iloc[0]["multi_results"]) + assert first_multi["throughput"] == 1234.56 + finally: 
os.chdir(original_dir) - + def test_csv_handles_none_values(self, tmp_dir): """Test that CSV generation handles None values correctly.""" data = [ @@ -655,31 +650,33 @@ def test_csv_handles_none_values(self, tmp_dir): ] json_path = os.path.join(tmp_dir, "perf_super.json") - with open(json_path, 'w') as f: + with open(json_path, "w") as f: json.dump(data, f) original_dir = os.getcwd() os.chdir(tmp_dir) - + try: update_perf_super_csv( - perf_super_json="perf_super.json", - perf_super_csv="perf_super.csv" + perf_super_json="perf_super.json", perf_super_csv="perf_super.csv" ) - + # Load CSV df = pd.read_csv("perf_super.csv") - + # Verify None values are handled - assert pd.isna(df.iloc[0]['configs']) or df.iloc[0]['configs'] == '' - assert pd.isna(df.iloc[0]['multi_results']) or df.iloc[0]['multi_results'] == '' - + assert pd.isna(df.iloc[0]["configs"]) or df.iloc[0]["configs"] == "" + assert ( + pd.isna(df.iloc[0]["multi_results"]) + or df.iloc[0]["multi_results"] == "" + ) + finally: os.chdir(original_dir) - + def test_csv_multiple_entries_in_entry_file(self, tmp_dir): """Test that perf_entry_super.csv can contain multiple entries from current run. - + This tests the fix for the issue where perf_entry.csv and perf_entry.json had 4 entries (for multiple results) but perf_entry_super.csv only had 1. Now perf_entry_super.csv should contain all entries from the current run. 
@@ -732,46 +729,54 @@ def test_csv_multiple_entries_in_entry_file(self, tmp_dir): "status": "SUCCESS", "configs": None, "multi_results": {"temperature": 45678}, - } + }, ] - + json_path = os.path.join(tmp_dir, "perf_super.json") - with open(json_path, 'w') as f: + with open(json_path, "w") as f: json.dump(data, f) original_dir = os.getcwd() os.chdir(tmp_dir) - + try: # Generate CSVs with num_entries=4 (simulating 4 entries added in current run) update_perf_super_csv( perf_super_json="perf_super.json", perf_super_csv="perf_super.csv", - num_entries=4 + num_entries=4, ) - + # Verify perf_entry_super.csv has ALL 4 entries from current run entry_df = pd.read_csv("perf_entry_super.csv") - assert len(entry_df) == 4, \ - f"perf_entry_super.csv should have 4 entries, got {len(entry_df)}" - + assert ( + len(entry_df) == 4 + ), f"perf_entry_super.csv should have 4 entries, got {len(entry_df)}" + # Verify the models are the 4 from the current run (not the old one) - models = entry_df['model'].tolist() - expected_models = ['dummy_multi_1', 'dummy_multi_2', 'dummy_multi_3', 'dummy_multi_4'] - assert models == expected_models, \ - f"Expected {expected_models}, got {models}" - + models = entry_df["model"].tolist() + expected_models = [ + "dummy_multi_1", + "dummy_multi_2", + "dummy_multi_3", + "dummy_multi_4", + ] + assert ( + models == expected_models + ), f"Expected {expected_models}, got {models}" + # Verify perf_super.csv has ALL 5 entries (old + new) super_df = pd.read_csv("perf_super.csv") - assert len(super_df) == 5, \ - f"perf_super.csv should have 5 entries (1 old + 4 new), got {len(super_df)}" - + assert ( + len(super_df) == 5 + ), f"perf_super.csv should have 5 entries (1 old + 4 new), got {len(super_df)}" + # Verify all models are in perf_super.csv - all_models = super_df['model'].tolist() - assert 'old_model' in all_models, "Old model should be in perf_super.csv" - assert all(m in all_models for m in expected_models), \ - "All new models should be in perf_super.csv" 
- + all_models = super_df["model"].tolist() + assert "old_model" in all_models, "Old model should be in perf_super.csv" + assert all( + m in all_models for m in expected_models + ), "All new models should be in perf_super.csv" + finally: os.chdir(original_dir) - diff --git a/tests/unit/test_rocm_path.py b/tests/unit/test_rocm_path.py index 2f2cbf7a..9f5cb41d 100644 --- a/tests/unit/test_rocm_path.py +++ b/tests/unit/test_rocm_path.py @@ -5,6 +5,7 @@ """ import os + import pytest from madengine.core.constants import get_rocm_path @@ -70,18 +71,25 @@ def test_context_build_only_mad_rocm_path(self): def test_context_runtime_includes_rocm_path_in_ctx(self): """Context stores host rocm_path; in-container ROCM_PATH is set at run time.""" - from madengine.core.context import Context from unittest.mock import patch + + from madengine.core.context import Context from madengine.utils.rocm_path_resolver import normalize_rocm_path ac = repr({MAD_ROCM_PATH: "/my/rocm"}) - with patch.object(Context, "get_gpu_vendor", return_value="AMD"), \ - patch.object(Context, "get_system_ngpus", return_value=2), \ - patch.object(Context, "get_system_gpu_architecture", return_value="gfx90a"), \ - patch.object(Context, "get_system_gpu_product_name", return_value="MI250"), \ - patch.object(Context, "get_system_hip_version", return_value="5.4"), \ - patch.object(Context, "get_docker_gpus", return_value="0-1"), \ - patch.object(Context, "get_gpu_renderD_nodes", return_value=None): + with patch.object(Context, "get_gpu_vendor", return_value="AMD"), patch.object( + Context, "get_system_ngpus", return_value=2 + ), patch.object( + Context, "get_system_gpu_architecture", return_value="gfx90a" + ), patch.object( + Context, "get_system_gpu_product_name", return_value="MI250" + ), patch.object( + Context, "get_system_hip_version", return_value="5.4" + ), patch.object( + Context, "get_docker_gpus", return_value="0-1" + ), patch.object( + Context, "get_gpu_renderD_nodes", return_value=None + ): ctx = 
Context(additional_context=ac) exp = normalize_rocm_path("/my/rocm") assert ctx.ctx.get("rocm_path") == exp @@ -90,8 +98,9 @@ def test_context_runtime_includes_rocm_path_in_ctx(self): def test_context_container_rocm_path_preserved_at_init(self): """docker_env_vars.ROCM_PATH is preserved at context init; finalize normalizes at run time.""" - from madengine.core.context import Context from unittest.mock import patch + + from madengine.core.context import Context from madengine.utils.rocm_path_resolver import normalize_rocm_path ac = repr( @@ -100,13 +109,19 @@ def test_context_container_rocm_path_preserved_at_init(self): "docker_env_vars": {"ROCM_PATH": "/in/image"}, } ) - with patch.object(Context, "get_gpu_vendor", return_value="AMD"), \ - patch.object(Context, "get_system_ngpus", return_value=2), \ - patch.object(Context, "get_system_gpu_architecture", return_value="gfx90a"), \ - patch.object(Context, "get_system_gpu_product_name", return_value="MI250"), \ - patch.object(Context, "get_system_hip_version", return_value="5.4"), \ - patch.object(Context, "get_docker_gpus", return_value="0-1"), \ - patch.object(Context, "get_gpu_renderD_nodes", return_value=None): + with patch.object(Context, "get_gpu_vendor", return_value="AMD"), patch.object( + Context, "get_system_ngpus", return_value=2 + ), patch.object( + Context, "get_system_gpu_architecture", return_value="gfx90a" + ), patch.object( + Context, "get_system_gpu_product_name", return_value="MI250" + ), patch.object( + Context, "get_system_hip_version", return_value="5.4" + ), patch.object( + Context, "get_docker_gpus", return_value="0-1" + ), patch.object( + Context, "get_gpu_renderD_nodes", return_value=None + ): ctx = Context(additional_context=ac) assert ctx._rocm_path == normalize_rocm_path("/on/host") # User-supplied ROCM_PATH is kept in docker_env_vars at init; finalize normalizes at run time. 
@@ -257,9 +272,7 @@ def test_rocm_root_from_bin_tool_amd_smi(self, tmp_path): smi.chmod(0o755) assert _rocm_root_from_bin_tool(str(smi.resolve())) == root.resolve() - def test_auto_detect_finds_injected_versioned_opt_rocm( - self, monkeypatch, tmp_path - ): + def test_auto_detect_finds_injected_versioned_opt_rocm(self, monkeypatch, tmp_path): """Simulate /opt/rocm-7.13.0 without depending on the host /opt tree.""" vroot = (tmp_path / "rocm-7.13.0").resolve() (vroot / "bin").mkdir(parents=True) @@ -287,15 +300,11 @@ def merged_looks(p): return real_looks(p) monkeypatch.setattr(rpr, "_looks_like_rocm_root", merged_looks) - monkeypatch.setattr( - rpr, "_versioned_opt_rocm_dirs", lambda: [vroot] - ) + monkeypatch.setattr(rpr, "_versioned_opt_rocm_dirs", lambda: [vroot]) out = auto_detect_rocm_path() assert out == normalize_rocm_path(str(vroot)) - def test_infer_root_from_path_tools_amd_smi( - self, monkeypatch, tmp_path - ): + def test_infer_root_from_path_tools_amd_smi(self, monkeypatch, tmp_path): """`which(amd-smi)` → .../rocm-7.13.0/bin/amd-smi` yields root with both smi tools.""" vroot = (tmp_path / "rocm-7.13.0").resolve() (vroot / "bin").mkdir(parents=True) diff --git a/tests/unit/test_therock_markers.py b/tests/unit/test_therock_markers.py index c819562e..eb01295a 100644 --- a/tests/unit/test_therock_markers.py +++ b/tests/unit/test_therock_markers.py @@ -2,9 +2,10 @@ Unit tests for TheRock on-disk marker helpers. 
""" -import pytest from pathlib import Path +import pytest + from madengine.utils.therock_markers import ( is_therock_tree, therock_dist_info_path, @@ -28,5 +29,8 @@ def test_is_therock_tree_false_without_markers(tmp_path: Path) -> None: @pytest.mark.unit def test_relpath_helpers_match_expected_layout() -> None: root = Path("/opt/rocm-7.0.0") - assert therock_manifest_path(root) == root / "share" / "therock" / "therock_manifest.json" + assert ( + therock_manifest_path(root) + == root / "share" / "therock" / "therock_manifest.json" + ) assert therock_dist_info_path(root) == root / "share" / "therock" / "dist_info.json" diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 2e09b8c3..2ac5633c 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -6,16 +6,16 @@ import pytest -from madengine.utils.path_utils import scripts_base_dir_from, get_madengine_root +from madengine.utils.path_utils import get_madengine_root, scripts_base_dir_from from madengine.utils.run_details import ( - get_pipeline, - get_build_number, flatten_tags_in_place, + get_build_number, + get_pipeline, ) - # ---- path_utils ---- + class TestScriptsBaseDirFrom: """Test scripts_base_dir_from helper.""" @@ -51,8 +51,11 @@ def test_returns_madengine_package_path(self): # ---- run_details ---- + class TestGetPipeline: - @pytest.mark.parametrize("env_val,expected", [({}, ""), ({"pipeline": "ci-mad"}, "ci-mad")]) + @pytest.mark.parametrize( + "env_val,expected", [({}, ""), ({"pipeline": "ci-mad"}, "ci-mad")] + ) def test_pipeline_from_env_or_default(self, env_val, expected): with patch.dict(os.environ, env_val, clear=False): if not env_val and "pipeline" in os.environ: @@ -61,7 +64,9 @@ def test_pipeline_from_env_or_default(self, env_val, expected): class TestGetBuildNumber: - @pytest.mark.parametrize("env_val,expected", [({}, "0"), ({"BUILD_NUMBER": "42"}, "42")]) + @pytest.mark.parametrize( + "env_val,expected", [({}, "0"), ({"BUILD_NUMBER": "42"}, "42")] + ) def 
test_build_number_from_env_or_default(self, env_val, expected): with patch.dict(os.environ, env_val, clear=False): if not env_val and "BUILD_NUMBER" in os.environ: diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py index 6a78c52d..4abf7c48 100644 --- a/tests/unit/test_validators.py +++ b/tests/unit/test_validators.py @@ -11,12 +11,12 @@ import pytest import typer +from madengine.cli.constants import ExitCode +from madengine.cli.validators import validate_additional_context from madengine.core.additional_context_defaults import ( DEFAULT_GPU_VENDOR, DEFAULT_GUEST_OS, ) -from madengine.cli.validators import validate_additional_context -from madengine.cli.constants import ExitCode class TestValidateAdditionalContext: