From 49c5e5ef189dfc2714aab42eb5cf3ce53d32ff89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?=
Date: Thu, 7 May 2026 11:44:04 +0000
Subject: [PATCH] [ci, build] chore: revert revert of PR #3702 (restore hybridep docker move)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverts 4f21a31d (#3729), restoring the hybridep docker install move from
Dockerfile.fw_base into Dockerfile.ci that PR #3702 originally introduced.
Done to retest whether the VLM perf regression suspected to be caused by
the move is still reproducible on current main.

Original revert message:

This reverts commit 4f21a31d7c859548d62dc1b0a63a325349ce1a93.

Signed-off-by: oliver könig
---
Reviewer notes (not part of the commit message; this text sits between the
three-dash marker and the first "diff --git" line):

  * Net effect: the "Install DeepEP and nvshmem" section is deleted from
    docker/Dockerfile.fw_base and added to docker/Dockerfile.ci. In
    docker/README.md the INSTALL_DEEPEP / DEEPEP_COMMIT rows move from the
    table that lists NCCL_VER / CUBLAS_VER / NVRTC_VER to the table that
    lists BASE_IMAGE, and patches/deepep.patch is re-described as applied
    during the CI image build.
  * The removed and added sections are not byte-identical: Dockerfile.fw_base
    declared "ARG INSTALL_DEEPEP" and "ARG DEEPEP_COMMIT" without defaults,
    while Dockerfile.ci declares "ARG INSTALL_DEEPEP=True" and pins
    DEEPEP_COMMIT to a full SHA. Because the RUN step is guarded by
    [ "$INSTALL_DEEPEP" = "True" ], the install branch runs in the CI image
    unless the build overrides INSTALL_DEEPEP.
  * The RUN step bind-mounts docker/patches/deepep.patch at /opt/deepep.patch
    and applies it with "patch -p1" after checking out FETCH_HEAD for
    $DEEPEP_COMMIT.
  * libnvidia-ml-dev is installed only around the "pip install ." build and is
    purged in the same RUN step; rdma-core and libibverbs-dev are not purged.
  * NOTE(review): "git pull" runs directly after a fresh "git clone", before
    the pinned commit is fetched; it looks redundant. Left as-is because this
    patch is meant to restore the earlier change verbatim.
  * NOTE(review): "apt-get update" runs twice in the same RUN step, and the
    package lists are only removed at the end; the second call is presumably
    unnecessary. Left as-is for the same reason.
  * NOTE(review): the DeepEP working tree is not deleted after "popd";
    presumably it remains in the image layer. Confirm whether that is wanted.
  * NOTE(review): pushd/popd are bash builtins; this assumes the RUN shell in
    this image is bash. TODO confirm against the base image / SHELL setting.
  * NOTE(review): leading whitespace inside the hunks below was reconstructed
    from a copy of this patch whose line breaks and indentation had been
    collapsed. Regenerate the patch from the commit before applying it.

 docker/Dockerfile.ci      | 48 +++++++++++++++++++++++++++++++++++++++
 docker/Dockerfile.fw_base | 48 ---------------------------------------
 docker/README.md          |  6 ++---
 3 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index 819c1fb37c..7ec51d9d71 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -30,6 +30,54 @@ RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+##############################################################################
+##
+## Install DeepEP and nvshmem
+##
+##############################################################################
+
+ARG INSTALL_DEEPEP=True
+ARG DEEPEP_COMMIT=34152ae28f80bcc3ee38d7a12cb2ad87cfd4ea72
+
+ENV HYBRID_EP_MULTINODE=1
+ENV RDMA_CORE_HOME=/opt/rdma-core/build
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH
+RUN --mount=type=bind,source=docker/patches/deepep.patch,target=/opt/deepep.patch \
+    if [ "$INSTALL_DEEPEP" = "True" ]; then \
+        # Upgrade system rdma-core to v60; libibverbs-dev supplies the unversioned
+        # libibverbs.so symlink required at link time.
+        apt-get update && apt-get install -y --allow-change-held-packages \
+            rdma-core libibverbs-dev && \
+        apt-get clean && \
+        ARCH_LIB=$(dpkg-architecture -qDEB_HOST_MULTIARCH) && \
+        # libmlx5-dev is not available in the apt sources; create the unversioned
+        # symlink from the versioned one as a fallback for the DeepEP build.
+        test -f /usr/lib/${ARCH_LIB}/libmlx5.so || \
+            ln -sf /usr/lib/${ARCH_LIB}/libmlx5.so.1 /usr/lib/${ARCH_LIB}/libmlx5.so && \
+        # Expose the system install under RDMA_CORE_HOME so DeepEP setup.py finds
+        # $RDMA_CORE_HOME/include and $RDMA_CORE_HOME/lib without a separate source build.
+        mkdir -p ${RDMA_CORE_HOME} && \
+        ln -sfn /usr/include ${RDMA_CORE_HOME}/include && \
+        ln -sfn /usr/lib/${ARCH_LIB} ${RDMA_CORE_HOME}/lib && \
+        git clone https://github.com/deepseek-ai/DeepEP.git && \
+        pushd DeepEP && \
+        git pull && \
+        git fetch origin $DEEPEP_COMMIT && \
+        git checkout FETCH_HEAD && \
+        patch -p1 < /opt/deepep.patch && \
+        pip install --no-cache-dir nvidia-nvshmem-cu13==3.4.5 && \
+        # Create symlink for libnvshmem_host.so (required for DeepEP fabric handle operations)
+        NVSHMEM_LIB_PATH=$(pip show nvidia-nvshmem-cu13 | grep "Location:" | cut -d' ' -f2)/nvidia/nvshmem/lib && \
+        ln -sf ${NVSHMEM_LIB_PATH}/libnvshmem_host.so.3 ${NVSHMEM_LIB_PATH}/libnvshmem_host.so && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends libnvidia-ml-dev && \
+        TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" pip install --no-cache-dir --no-build-isolation -v . && \
+        apt-get purge -y libnvidia-ml-dev && \
+        apt-get autoremove -y && \
+        rm -rf /var/lib/apt/lists/* && \
+        popd; \
+    fi
+
 COPY pyproject.toml uv.lock /opt/Megatron-Bridge/
 COPY src/megatron/bridge/__init__.py src/megatron/bridge/package_info.py /opt/Megatron-Bridge/src/megatron/bridge/
 COPY 3rdparty/Megatron-LM/pyproject.toml 3rdparty/Megatron-LM/setup.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/
diff --git a/docker/Dockerfile.fw_base b/docker/Dockerfile.fw_base
index b81ad4cdaa..7492ee8bc9 100644
--- a/docker/Dockerfile.fw_base
+++ b/docker/Dockerfile.fw_base
@@ -122,54 +122,6 @@ RUN --mount=type=bind,source=docker/common/install_nsys.sh,target=/opt/install_n
         bash /opt/install_cublas.sh --CUBLAS_VER=$CUBLAS_VERSION; \
     fi
 
-##############################################################################
-##
-## Install DeepEP and nvshmem
-##
-##############################################################################
-
-ARG INSTALL_DEEPEP
-ARG DEEPEP_COMMIT
-
-ENV HYBRID_EP_MULTINODE=1
-ENV RDMA_CORE_HOME=/opt/rdma-core/build
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH
-RUN --mount=type=bind,source=docker/patches/deepep.patch,target=/opt/deepep.patch \
-    if [ "$INSTALL_DEEPEP" = "True" ]; then \
-        # Upgrade system rdma-core to v60; libibverbs-dev supplies the unversioned
-        # libibverbs.so symlink required at link time.
-        apt-get update && apt-get install -y --allow-change-held-packages \
-            rdma-core libibverbs-dev && \
-        apt-get clean && \
-        ARCH_LIB=$(dpkg-architecture -qDEB_HOST_MULTIARCH) && \
-        # libmlx5-dev is not available in the apt sources; create the unversioned
-        # symlink from the versioned one as a fallback for the DeepEP build.
-        test -f /usr/lib/${ARCH_LIB}/libmlx5.so || \
-            ln -sf /usr/lib/${ARCH_LIB}/libmlx5.so.1 /usr/lib/${ARCH_LIB}/libmlx5.so && \
-        # Expose the system install under RDMA_CORE_HOME so DeepEP setup.py finds
-        # $RDMA_CORE_HOME/include and $RDMA_CORE_HOME/lib without a separate source build.
-        mkdir -p ${RDMA_CORE_HOME} && \
-        ln -sfn /usr/include ${RDMA_CORE_HOME}/include && \
-        ln -sfn /usr/lib/${ARCH_LIB} ${RDMA_CORE_HOME}/lib && \
-        git clone https://github.com/deepseek-ai/DeepEP.git && \
-        pushd DeepEP && \
-        git pull && \
-        git fetch origin $DEEPEP_COMMIT && \
-        git checkout FETCH_HEAD && \
-        patch -p1 < /opt/deepep.patch && \
-        pip install --no-cache-dir nvidia-nvshmem-cu13==3.4.5 && \
-        # Create symlink for libnvshmem_host.so (required for DeepEP fabric handle operations)
-        NVSHMEM_LIB_PATH=$(pip show nvidia-nvshmem-cu13 | grep "Location:" | cut -d' ' -f2)/nvidia/nvshmem/lib && \
-        ln -sf ${NVSHMEM_LIB_PATH}/libnvshmem_host.so.3 ${NVSHMEM_LIB_PATH}/libnvshmem_host.so && \
-        apt-get update && \
-        apt-get install -y --no-install-recommends libnvidia-ml-dev && \
-        TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" pip install --no-cache-dir --no-build-isolation -v . && \
-        apt-get purge -y libnvidia-ml-dev && \
-        apt-get autoremove -y && \
-        rm -rf /var/lib/apt/lists/* && \
-        popd; \
-    fi
-
 ##############################################################################
 ##
 ## Install vLLM
diff --git a/docker/README.md b/docker/README.md
index 686eb0d4b7..9dd225b8ef 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -123,8 +123,6 @@ docker build \
 | `NCCL_VER` | NCCL version for the TRT-LLM install scripts |
 | `CUBLAS_VER` | cuBLAS version for the TRT-LLM install scripts |
 | `NVRTC_VER` | NVRTC version for the TRT-LLM install scripts |
-| `INSTALL_DEEPEP` | Set to `True` to build and install DeepEP and nvshmem |
-| `DEEPEP_COMMIT` | DeepEP git commit SHA |
 | `REINSTALL_NSYS` | Set to `True` to reinstall Nsight Systems from the NVIDIA apt repo |
 | `NSYS_VERSION` | Nsight Systems version (e.g. `2026.1.0.1085`) |
 | `REINSTALL_CUDNN` | Set to `True` to reinstall cuDNN from the NVIDIA apt repo |
@@ -139,6 +137,8 @@ docker build \
 | Argument | Description |
 |---|---|
 | `BASE_IMAGE` | Base container; set to the fw-base image when building the full stack |
+| `INSTALL_DEEPEP` | Set to `True` to build and install DeepEP and nvshmem |
+| `DEEPEP_COMMIT` | DeepEP git commit SHA |
 | `MCORE_TRIGGERED_TESTING` | Skip uv lockfile check for cross-version Megatron-LM testing |
 | `UV_CACHE_PRUNE_ARGS` | Extra arguments for `uv cache prune` |
 
@@ -163,5 +163,5 @@ docker build \
 | `common/install_nccl.sh` | Reinstall NCCL from the public NVIDIA CUDA apt repo |
 | `common/install_cudnn.sh` | Reinstall cuDNN from the public NVIDIA CUDA apt repo |
 | `common/install_nsys.sh` | Reinstall Nsight Systems from the public NVIDIA CUDA apt repo |
-| `patches/deepep.patch` | Patch applied to DeepEP during fw-base build |
+| `patches/deepep.patch` | Patch applied to DeepEP during CI image build |
 | `patches/vllm.patch` | Patch applied to vLLM after install in fw-base |