ROCm · raviguptaamd · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
@@ -5,6 +5,32 @@ All notable changes to madengine will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+
+- **`slurm_multi` SLURM launcher**: New self-managed multi-node launcher for workloads that orchestrate their own per-node Docker containers via `srun` (e.g. SGLang Disaggregated proxy + prefill + decode topologies). Selected via `distributed.launcher: "slurm_multi"` (or `"slurm-multi"` alias). Generates a wrapper SBATCH script that runs the model's `.slurm` script directly on baremetal so `srun`/`scontrol` work inside it; performs parallel `srun docker pull` of the registry image on all allocated nodes when the model card sets `env_vars.DOCKER_IMAGE_NAME`. Honors model-card and `--additional-context` `slurm` fields (`partition`, `nodes`, `gpus_per_node`, `time`, `exclusive`, `reservation`, `nodelist`).
+
+- **`madengine build --use-image [IMAGE | auto]`**: Skip the local Docker build and use a pre-built image instead. With no value (equivalent to passing `auto`), the image resolves to the model card's `env_vars.DOCKER_IMAGE_NAME` automatically. Mutually exclusive with `--registry` and `--build-on-compute`.
+
+- **`madengine build --build-on-compute`**: Build Docker images on a SLURM compute node and push to a registry, then have `madengine run` pull the image in parallel on all allocated nodes. Requires `--registry`. The resulting manifest carries `built_on_compute: true`.
+
+- **slurm_multi build registry gate**: When `madengine build` discovers a `slurm_multi` model and no `--registry`/`--use-image`/`--build-on-compute` is given, the orchestrator either auto-uses `env_vars.DOCKER_IMAGE_NAME` from the model card (implicit `--use-image` fallback) or raises a structured `ConfigurationError` with the four supported options listed.
+
+- **bash-in-salloc execution path** for slurm_multi: When `madengine run` detects `SLURM_JOB_ID` (i.e. it is running inside an existing `salloc` allocation), the slurm_multi launcher runs the generated wrapper synchronously with `bash` instead of nesting another `sbatch` job. Other launchers continue to use `sbatch` even inside `salloc` (no behavior change for non-slurm_multi launchers).
+
+- **`DeploymentResult.skip_monitoring`** (`deployment/base.py`): new dataclass field so synchronous deploy paths (e.g. slurm_multi's bash-in-salloc) can skip the monitor poll.
+
+- **`SlurmNodeSelector` `reservation` parameter**: optional reservation name forwarded to srun health/cleanup commands so node-prep srun calls run inside the reservation.
+
+- **`tests/unit/test_slurm_multi.py`**: contract tests for `slurm_multi` registry membership, hyphen alias normalization, and end-to-end env_vars-export contract against MAD-private PR #186's `pyt_sglang_disagg_qwen3-32b_short` model card.
+
+- **`examples/slurm-configs/minimal/slurm-multi-minimal.json`**: minimal reference config for the new launcher.
+
+### Fixed
+
+- **slurm_multi: cwd `perf.csv` aggregation**: After a successful slurm_multi run, `madengine run` previously printed a cosmetic `Performance CSV not found: perf.csv` warning even though `_collect_slurm_multi_results` had ingested the per-job CSV from `/shared_inference/$USER/$JOBID/perf.csv`, because the reporter (`display_performance_table`) reads cwd `perf.csv` by default. Now `_collect_slurm_multi_results` also writes the per-job rows into cwd `perf.csv` (copying the file if absent, appending its data rows if present) so reporting and HTML generation work without extra args. Local and classic-SLURM flows are unchanged.
+
 ## [2.0.3] - 2026-05-06
 
 ### Changed

@@ -0,0 +1,23 @@
+{
+  "_comment": "Minimal slurm_multi launcher configuration - 3 nodes minimum",
+  "_description": "Self-managed multi-node SLURM launcher (script runs on baremetal, manages its own Docker via srun)",
+  "_architecture": "Wrapper SBATCH exports env_vars and runs the model's .slurm script directly on the head node; the script orchestrates per-node containers via srun",
+
+  "gpu_vendor": "AMD",
+  "guest_os": "UBUNTU",
+  "deploy": "slurm",
+
+  "slurm": {
+    "partition": "gpu",
+    "nodes": 3,
+    "gpus_per_node": 8,
+    "time": "04:00:00",
+    "exclusive": true
+  },
+
+  "distributed": {
+    "launcher": "slurm_multi",
+    "nnodes": 3,
+    "nproc_per_node": 8
+  }
+}
@@ -55,6 +55,28 @@ def build(
             "--batch-manifest", help="Input batch.json file for batch build mode"
         ),
     ] = None,
+    # NOTE: `is_flag=False, flag_value="auto"` lets `--use-image` (no value)
+    # mean "auto-detect from the model card's DOCKER_IMAGE_NAME", matching
+    # MAD-private PR #186's documented UX. Typer is deprecating this pattern
+    # for a future release; when removed, switch to requiring an explicit
+    # value (e.g. `--use-image auto` as the documented sentinel) and update
+    # MAD-private's docs in lockstep.
+    use_image: Annotated[
+        Optional[str],
+        typer.Option(
+            "--use-image",
+            is_flag=False,
+            flag_value="auto",
+            help="Skip Docker build and use pre-built image. Optionally specify image name, or omit to auto-detect from model card's DOCKER_IMAGE_NAME"
+        ),
+    ] = None,
+    build_on_compute: Annotated[
+        bool,
+        typer.Option(
+            "--build-on-compute",
+            help="Build Docker images on SLURM compute node instead of login node"
+        ),
+    ] = False,
     additional_context: Annotated[
         str,
         typer.Option(
@@ -116,6 +138,31 @@ def build(
         )
         raise typer.Exit(ExitCode.INVALID_ARGS)
 
+    if use_image and registry:
+        console.print(
+            "❌ [bold red]Error: Cannot specify both --use-image and --registry options[/bold red]\n"
+            "[yellow]Use --use-image for pre-built external images.[/yellow]\n"
+            "[yellow]Use --registry to push locally built images.[/yellow]"
+        )
+        raise typer.Exit(ExitCode.INVALID_ARGS)
+
+    if use_image and build_on_compute:
+        console.print(
+            "❌ [bold red]Error: Cannot specify both --use-image and --build-on-compute options[/bold red]\n"
+            "[yellow]--use-image skips Docker build entirely.[/yellow]\n"
+            "[yellow]--build-on-compute builds on SLURM compute nodes.[/yellow]"
+        )
+        raise typer.Exit(ExitCode.INVALID_ARGS)
+
+    if build_on_compute and not registry:
+        console.print(
+            "❌ [bold red]Error: --build-on-compute requires --registry option[/bold red]\n"
+            "[yellow]Build on compute node pushes image to registry.[/yellow]\n"
+            "[yellow]Run phase will pull image in parallel on all nodes.[/yellow]\n"
+            "[dim]Example: --build-on-compute --registry docker.io/myorg[/dim]"
+        )
+        raise typer.Exit(ExitCode.INVALID_ARGS)
+
     # Process batch manifest if provided
     batch_data = None
     effective_tags = processed_tags
@@ -175,7 +222,7 @@ def build(
     try:
         # Validate additional context and merge file + CLI; defaults wired into orchestrator
         validated_context = validate_additional_context(
-            additional_context, additional_context_file
+            additional_context, additional_context_file, use_image
         )
 
         # Create arguments object
@@ -191,6 +238,8 @@ def build(
             verbose=verbose,
             _separate_phases=True,
             batch_build_metadata=batch_build_metadata if batch_build_metadata else None,
+            use_image=use_image,
+            build_on_compute=build_on_compute,
         )
 
         # Initialize orchestrator in build-only mode
@@ -211,6 +260,8 @@ def build(
                 clean_cache=clean_docker_cache,
                 manifest_output=manifest_output,
                 batch_build_metadata=batch_build_metadata,
+                use_image=use_image,
+                build_on_compute=build_on_compute,
             )
 
             # Load build summary for display

@@ -298,13 +298,18 @@ def additional_context_needs_cli_validation(
 def validate_additional_context(
     additional_context: str,
     additional_context_file: Optional[str] = None,
+    use_image: Optional[str] = None,
 ) -> Dict[str, Any]:
     """
     Validate and parse additional context.
 
     Args:
         additional_context: JSON string containing additional context
         additional_context_file: Optional file containing additional context
+        use_image: Pre-built image override forwarded by build.py for CLI signature
+            compatibility. Currently informational only -- validation behavior is
+            unchanged when this is set; callers wanting to skip required-field
+            checks should adjust ``finalize_additional_context_dict`` directly.
 
     Returns:
         Dict containing parsed additional context

@@ -82,6 +82,7 @@ class DeploymentResult:
     metrics: Optional[Dict[str, Any]] = None
     logs_path: Optional[str] = None
     artifacts: Optional[List[str]] = None
+    skip_monitoring: bool = False  # Set True for synchronous runs (e.g., inside salloc)
 
     @property
     def is_success(self) -> bool:
@@ -196,7 +197,8 @@ def execute(self) -> DeploymentResult:
                 return result
 
             # Step 4: Monitor (optional)
-            if self.config.monitor:
+            # Skip monitoring if deploy() already ran synchronously (e.g., inside salloc)
+            if self.config.monitor and not result.skip_monitoring:
                 result = self._monitor_until_complete(result.deployment_id)
 
             # Step 5: Collect Results (always collect, even on failure to record failed runs)

@@ -21,7 +21,8 @@
     "primus",
     "vllm",
     "sglang",
-    "sglang-disagg"
+    "sglang-disagg",
+    "slurm_multi",
 ]
 
 # Tool names that use rocprof / rocprofv3 wrapping and need MPI-aware rocprofv3 on multi-node.
@@ -62,6 +63,8 @@ def normalize_launcher(launcher_type: Optional[str], deployment_type: str) -> st
 
     Logic:
     - If launcher is in VALID_LAUNCHERS: keep as-is
+    - If launcher's hyphen/underscore variant is in VALID_LAUNCHERS: normalize
+      (e.g. "slurm-multi" -> "slurm_multi")
     - If launcher is None/empty/invalid:
         * local → "docker" (runs in Docker container)
         * slurm → "docker" (typically uses containers on compute nodes)
@@ -76,6 +79,9 @@ def normalize_launcher(launcher_type: Optional[str], deployment_type: str) -> st
     """
     if launcher_type and launcher_type in VALID_LAUNCHERS:
         return launcher_type
+    # Normalize hyphen variant: slurm-multi -> slurm_multi
+    if launcher_type and launcher_type.replace("-", "_") in VALID_LAUNCHERS:
+        return launcher_type.replace("-", "_")
     if deployment_type == "local":
         return "docker"
     if deployment_type == "slurm":