From c77b31ae2f1b0c6341cfdbbebe80e4ee03c6522b Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Wed, 6 May 2026 19:46:45 -0700 Subject: [PATCH] [ci, build] chore: Revert "refactor(docker): move hybridep install from fw_base to Dockerfile.ci (#3702)" This reverts commit dec6d631f4629888356006f4839dc33fbbb7861f. L0_Launch_converter::test_generate_vlm regressed from ~1m44s to ~9m starting at this commit. Reverting to confirm whether the DeepEP install (and its env vars HYBRID_EP_MULTINODE / LD_LIBRARY_PATH) being baked into the CI image is the cause. Signed-off-by: yaoyu-33 --- docker/Dockerfile.ci | 48 --------------------------------------- docker/Dockerfile.fw_base | 48 +++++++++++++++++++++++++++++++++++++++ docker/README.md | 6 ++--- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 7ec51d9d71..819c1fb37c 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -30,54 +30,6 @@ RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -############################################################################## -## -## Install DeepEP and nvshmem -## -############################################################################## - -ARG INSTALL_DEEPEP=True -ARG DEEPEP_COMMIT=34152ae28f80bcc3ee38d7a12cb2ad87cfd4ea72 - -ENV HYBRID_EP_MULTINODE=1 -ENV RDMA_CORE_HOME=/opt/rdma-core/build -ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH -RUN --mount=type=bind,source=docker/patches/deepep.patch,target=/opt/deepep.patch \ - if [ "$INSTALL_DEEPEP" = "True" ]; then \ - # Upgrade system rdma-core to v60; libibverbs-dev supplies the unversioned - # libibverbs.so symlink required at link time. - apt-get update && apt-get install -y --allow-change-held-packages \ - rdma-core libibverbs-dev && \ - apt-get clean && \ - ARCH_LIB=$(dpkg-architecture -qDEB_HOST_MULTIARCH) && \ - # libmlx5-dev is not available in the apt sources; create the unversioned - # symlink from the versioned one as a fallback for the DeepEP build. - test -f /usr/lib/${ARCH_LIB}/libmlx5.so || \ - ln -sf /usr/lib/${ARCH_LIB}/libmlx5.so.1 /usr/lib/${ARCH_LIB}/libmlx5.so && \ - # Expose the system install under RDMA_CORE_HOME so DeepEP setup.py finds - # $RDMA_CORE_HOME/include and $RDMA_CORE_HOME/lib without a separate source build. - mkdir -p ${RDMA_CORE_HOME} && \ - ln -sfn /usr/include ${RDMA_CORE_HOME}/include && \ - ln -sfn /usr/lib/${ARCH_LIB} ${RDMA_CORE_HOME}/lib && \ - git clone https://github.com/deepseek-ai/DeepEP.git && \ - pushd DeepEP && \ - git pull && \ - git fetch origin $DEEPEP_COMMIT && \ - git checkout FETCH_HEAD && \ - patch -p1 < /opt/deepep.patch && \ - pip install --no-cache-dir nvidia-nvshmem-cu13==3.4.5 && \ - # Create symlink for libnvshmem_host.so (required for DeepEP fabric handle operations) - NVSHMEM_LIB_PATH=$(pip show nvidia-nvshmem-cu13 | grep "Location:" | cut -d' ' -f2)/nvidia/nvshmem/lib && \ - ln -sf ${NVSHMEM_LIB_PATH}/libnvshmem_host.so.3 ${NVSHMEM_LIB_PATH}/libnvshmem_host.so && \ - apt-get update && \ - apt-get install -y --no-install-recommends libnvidia-ml-dev && \ - TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" pip install --no-cache-dir --no-build-isolation -v . && \ - apt-get purge -y libnvidia-ml-dev && \ - apt-get autoremove -y && \ - rm -rf /var/lib/apt/lists/* && \ - popd; \ - fi - COPY pyproject.toml uv.lock /opt/Megatron-Bridge/ COPY src/megatron/bridge/__init__.py src/megatron/bridge/package_info.py /opt/Megatron-Bridge/src/megatron/bridge/ COPY 3rdparty/Megatron-LM/pyproject.toml 3rdparty/Megatron-LM/setup.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/ diff --git a/docker/Dockerfile.fw_base b/docker/Dockerfile.fw_base index 7492ee8bc9..b81ad4cdaa 100644 --- a/docker/Dockerfile.fw_base +++ b/docker/Dockerfile.fw_base @@ -122,6 +122,54 @@ RUN --mount=type=bind,source=docker/common/install_nsys.sh,target=/opt/install_n bash /opt/install_cublas.sh --CUBLAS_VER=$CUBLAS_VERSION; \ fi +############################################################################## +## +## Install DeepEP and nvshmem +## +############################################################################## + +ARG INSTALL_DEEPEP +ARG DEEPEP_COMMIT + +ENV HYBRID_EP_MULTINODE=1 +ENV RDMA_CORE_HOME=/opt/rdma-core/build +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH +RUN --mount=type=bind,source=docker/patches/deepep.patch,target=/opt/deepep.patch \ + if [ "$INSTALL_DEEPEP" = "True" ]; then \ + # Upgrade system rdma-core to v60; libibverbs-dev supplies the unversioned + # libibverbs.so symlink required at link time. + apt-get update && apt-get install -y --allow-change-held-packages \ + rdma-core libibverbs-dev && \ + apt-get clean && \ + ARCH_LIB=$(dpkg-architecture -qDEB_HOST_MULTIARCH) && \ + # libmlx5-dev is not available in the apt sources; create the unversioned + # symlink from the versioned one as a fallback for the DeepEP build. + test -f /usr/lib/${ARCH_LIB}/libmlx5.so || \ + ln -sf /usr/lib/${ARCH_LIB}/libmlx5.so.1 /usr/lib/${ARCH_LIB}/libmlx5.so && \ + # Expose the system install under RDMA_CORE_HOME so DeepEP setup.py finds + # $RDMA_CORE_HOME/include and $RDMA_CORE_HOME/lib without a separate source build. + mkdir -p ${RDMA_CORE_HOME} && \ + ln -sfn /usr/include ${RDMA_CORE_HOME}/include && \ + ln -sfn /usr/lib/${ARCH_LIB} ${RDMA_CORE_HOME}/lib && \ + git clone https://github.com/deepseek-ai/DeepEP.git && \ + pushd DeepEP && \ + git pull && \ + git fetch origin $DEEPEP_COMMIT && \ + git checkout FETCH_HEAD && \ + patch -p1 < /opt/deepep.patch && \ + pip install --no-cache-dir nvidia-nvshmem-cu13==3.4.5 && \ + # Create symlink for libnvshmem_host.so (required for DeepEP fabric handle operations) + NVSHMEM_LIB_PATH=$(pip show nvidia-nvshmem-cu13 | grep "Location:" | cut -d' ' -f2)/nvidia/nvshmem/lib && \ + ln -sf ${NVSHMEM_LIB_PATH}/libnvshmem_host.so.3 ${NVSHMEM_LIB_PATH}/libnvshmem_host.so && \ + apt-get update && \ + apt-get install -y --no-install-recommends libnvidia-ml-dev && \ + TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" pip install --no-cache-dir --no-build-isolation -v . && \ + apt-get purge -y libnvidia-ml-dev && \ + apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* && \ + popd; \ + fi + ############################################################################## ## ## Install vLLM diff --git a/docker/README.md b/docker/README.md index 9dd225b8ef..686eb0d4b7 100644 --- a/docker/README.md +++ b/docker/README.md @@ -123,6 +123,8 @@ docker build \ | `NCCL_VER` | NCCL version for the TRT-LLM install scripts | | `CUBLAS_VER` | cuBLAS version for the TRT-LLM install scripts | | `NVRTC_VER` | NVRTC version for the TRT-LLM install scripts | +| `INSTALL_DEEPEP` | Set to `True` to build and install DeepEP and nvshmem | +| `DEEPEP_COMMIT` | DeepEP git commit SHA | | `REINSTALL_NSYS` | Set to `True` to reinstall Nsight Systems from the NVIDIA apt repo | | `NSYS_VERSION` | Nsight Systems version (e.g. `2026.1.0.1085`) | | `REINSTALL_CUDNN` | Set to `True` to reinstall cuDNN from the NVIDIA apt repo | @@ -137,8 +139,6 @@ docker build \ | Argument | Description | |---|---| | `BASE_IMAGE` | Base container; set to the fw-base image when building the full stack | -| `INSTALL_DEEPEP` | Set to `True` to build and install DeepEP and nvshmem | -| `DEEPEP_COMMIT` | DeepEP git commit SHA | | `MCORE_TRIGGERED_TESTING` | Skip uv lockfile check for cross-version Megatron-LM testing | | `UV_CACHE_PRUNE_ARGS` | Extra arguments for `uv cache prune` | @@ -163,5 +163,5 @@ docker build \ | `common/install_nccl.sh` | Reinstall NCCL from the public NVIDIA CUDA apt repo | | `common/install_cudnn.sh` | Reinstall cuDNN from the public NVIDIA CUDA apt repo | | `common/install_nsys.sh` | Reinstall Nsight Systems from the public NVIDIA CUDA apt repo | -| `patches/deepep.patch` | Patch applied to DeepEP during CI image build | +| `patches/deepep.patch` | Patch applied to DeepEP during fw-base build | | `patches/vllm.patch` | Patch applied to vLLM after install in fw-base |