From e5205e4fd975a4a03f8440a7c58c4a171815477d Mon Sep 17 00:00:00 2001 From: Andy Chang Date: Wed, 18 Mar 2026 22:11:38 -0500 Subject: [PATCH 1/2] Update Docker environment for MI355X (gfx950) benchmark reproducibility - Update base image from ROCm 6.4 to ROCm 7.1.1 (PyTorch 2.10.0) to support MI355X gfx950 architecture - Add Python dependencies: amd-aiter (INT8 GEMM overlap tests), ninja, prettytable, pytest-assume, setuptools-scm - Add MORI build step: COPY source, compile with MORI_GPU_ARCHS=gfx950, use --no-build-isolation to leverage base image PyTorch - Set required environment variables: HSA_NO_SCRATCH_RECLAIM=1, PYTHONPATH - Add .dockerignore to exclude .git/, build/, *.csv (reduces build context from 235MB to ~19MB) - Add docker/run_benchmark.sh helper script with GPU passthrough flags - Update README with Docker usage, benchmark commands, and non-Docker installation instructions --- .dockerignore | 11 ++++++++ README.md | 60 +++++++++++++++++++++++++++++++++++------ docker/run_benchmark.sh | 15 +++++++++++ 3 files changed, 78 insertions(+), 8 deletions(-) create mode 100644 .dockerignore create mode 100755 docker/run_benchmark.sh diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..971059d3 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +.git +build +__pycache__ +*.pyc +*.pyo +*.egg-info +*.egg +dist +*.csv +*.jpg +*.png diff --git a/README.md b/README.md index c646c6bf..8495eb5e 100644 --- a/README.md +++ b/README.md @@ -96,20 +96,64 @@ Benchmark result on the following configurations: ## Installation -### Prerequisites +### Docker (recommended for MI355X) -- pytorch:rocm >= 6.4.0 -- Linux packages: see packages in dockerfile +Build a self-contained image with all dependencies pre-installed: -Or build docker image with: +```bash +cd mori && git submodule update --init --recursive +docker build -t rocm/mori:benchmark -f docker/Dockerfile.dev .
+``` + +Launch a container with GPU access: + +```bash +bash docker/run_benchmark.sh +# or manually: +docker run --rm -it \ + --device=/dev/kfd --device=/dev/dri \ + --ipc=host --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE --group-add video --group-add render \ + rocm/mori:benchmark bash ``` -cd mori && docker build -t rocm/mori:dev -f docker/Dockerfile.dev . + +Run CCL benchmarks inside the container: + +```bash +# AllReduce sweep (2-256 MB) +bash tests/python/ccl/bench_allreduce_sweep.sh + +# AllGather / ReduceScatter standalone latency sweep +bash tests/python/ccl/bench_allgather_sweep.sh +bash tests/python/ccl/bench_reducescatter_sweep.sh + +# AllGather / ReduceScatter + GEMM overlap sweep +bash tests/python/ccl/bench_ag_overlap_sweep.sh +bash tests/python/ccl/bench_rs_overlap_sweep.sh ``` -### Install with Python +> **Note**: The default `MORI_GPU_ARCHS` is `gfx950` (MI355X). To build for MI300X, pass `--build-arg MORI_GPU_ARCHS=gfx942`. + +### Install without Docker + +Prerequisites: +- PyTorch with ROCm (version must match your system ROCm, e.g. `pip install torch --index-url https://download.pytorch.org/whl/rocm7.1`) +- Linux packages: `git`, `cython3`, `ibverbs-utils`, `openmpi-bin`, `libopenmpi-dev`, `libpci-dev`, `cmake`, `libdw1`, `locales` +- For GEMM overlap tests: `pip install amd-aiter ninja` + +```bash +cd mori +pip install -r requirements-build.txt +git submodule update --init --recursive +export MORI_GPU_ARCHS=gfx950 # or gfx942 for MI300X +pip3 install . # add --no-build-isolation if using venv ``` -# NOTE: for venv build, add --no-build-isolation at the end -cd mori && pip install -r requirements-build.txt && git submodule update --init --recursive && pip3 install . 
+ +Required environment variables: + +```bash +export PYTHONPATH=/path/to/mori${PYTHONPATH:+:$PYTHONPATH} +export HSA_NO_SCRATCH_RECLAIM=1 ``` ### Test dispatch / combine diff --git a/docker/run_benchmark.sh b/docker/run_benchmark.sh new file mode 100755 index 00000000..822154f1 --- /dev/null +++ b/docker/run_benchmark.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +IMAGE_NAME="${1:-rocm/mori:benchmark}" + +docker run --rm -it \ + --device=/dev/kfd \ + --device=/dev/dri \ + --ipc=host \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --group-add video \ + --group-add render \ + "$IMAGE_NAME" \ + bash From d19f3f57ef64841ac489d3624f606c99a2e6893e Mon Sep 17 00:00:00 2001 From: Andy Chang Date: Wed, 18 Mar 2026 22:20:57 -0500 Subject: [PATCH 2/2] Update Dockerfile.dev: upgrade to ROCm 7.1.1 with full build pipeline --- docker/Dockerfile.dev | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index fcd330a6..cbfdae22 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -1,5 +1,4 @@ -# FROM rocm/pytorch:rocm6.4.1_ubuntu22.04_py3.12_pytorch_release_2.5.1 -FROM rocm/pytorch:rocm6.4.3_ubuntu22.04_py3.10_pytorch_release_2.5.1 +FROM rocm/pytorch:rocm7.1.1_ubuntu22.04_py3.10_pytorch_release_2.10.0 RUN apt-get update && \ apt-get install -y \ @@ -11,4 +10,24 @@ RUN apt-get update && \ libpci-dev \ cmake \ libdw1 \ - locales + locales && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + amd-aiter ninja prettytable pytest-assume \ + setuptools-scm cmake 'setuptools>=65' + +COPY . /workspace/mori +WORKDIR /workspace/mori + +ARG MORI_GPU_ARCHS=gfx950 +ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.1.0 +RUN MORI_GPU_ARCHS=${MORI_GPU_ARCHS} \ + PYTORCH_ROCM_ARCH=${MORI_GPU_ARCHS} \ + pip3 install . --no-cache-dir --no-build-isolation + +ENV PYTHONPATH=/workspace/mori${PYTHONPATH:+:${PYTHONPATH}} +ENV HSA_NO_SCRATCH_RECLAIM=1 +ENV PATH=/root/.local/bin:${PATH} + +CMD ["/bin/bash"]