Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions docker/Dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,54 @@ RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

##############################################################################
##
## Install DeepEP and nvshmem
##
##############################################################################

# Build arguments for the optional DeepEP + nvshmem install.
#   INSTALL_DEEPEP  set to "True" to build and install DeepEP and nvshmem
#   DEEPEP_COMMIT   DeepEP git commit SHA that is fetched and checked out
ARG INSTALL_DEEPEP=True
ARG DEEPEP_COMMIT=34152ae28f80bcc3ee38d7a12cb2ad87cfd4ea72

# NOTE(review): HYBRID_EP_MULTINODE is presumably read by the DeepEP build — confirm.
ENV HYBRID_EP_MULTINODE=1
# Directory under which the RUN step below exposes the system rdma-core
# headers and libraries as include/ and lib/ symlinks.
ENV RDMA_CORE_HOME=/opt/rdma-core/build
# Put the CUDA runtime libraries first on the loader search path.
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH
# The DeepEP patch is bind-mounted from the build context for this step only.
# NOTE(review): pushd/popd are bash builtins — this assumes the stage's RUN
# shell is bash (the SHELL instruction is not visible in this hunk).
RUN --mount=type=bind,source=docker/patches/deepep.patch,target=/opt/deepep.patch \
    if [ "$INSTALL_DEEPEP" = "True" ]; then \
        # Upgrade system rdma-core to v60; libibverbs-dev supplies the unversioned
        # libibverbs.so symlink required at link time.
        apt-get update && apt-get install -y --allow-change-held-packages \
            rdma-core libibverbs-dev && \
        apt-get clean && \
        ARCH_LIB=$(dpkg-architecture -qDEB_HOST_MULTIARCH) && \
        # libmlx5-dev is not available in the apt sources; create the unversioned
        # symlink from the versioned one as a fallback for the DeepEP build.
        # The braces are required: `&&` and `||` have equal precedence and are
        # left-associative, so without grouping a failure of ANY earlier step
        # (e.g. apt-get install) would fall through to the `ln -sf` fallback,
        # which succeeds and lets the build continue with the error masked.
        { test -f /usr/lib/${ARCH_LIB}/libmlx5.so || \
            ln -sf /usr/lib/${ARCH_LIB}/libmlx5.so.1 /usr/lib/${ARCH_LIB}/libmlx5.so; } && \
        # Expose the system install under RDMA_CORE_HOME so DeepEP setup.py finds
        # $RDMA_CORE_HOME/include and $RDMA_CORE_HOME/lib without a separate source build.
        mkdir -p ${RDMA_CORE_HOME} && \
        ln -sfn /usr/include ${RDMA_CORE_HOME}/include && \
        ln -sfn /usr/lib/${ARCH_LIB} ${RDMA_CORE_HOME}/lib && \
        git clone https://github.com/deepseek-ai/DeepEP.git && \
        pushd DeepEP && \
        git pull && \
        # Pin the checkout to the requested commit, then apply the local patch.
        git fetch origin $DEEPEP_COMMIT && \
        git checkout FETCH_HEAD && \
        patch -p1 < /opt/deepep.patch && \
        pip install --no-cache-dir nvidia-nvshmem-cu13==3.4.5 && \
        # Create symlink for libnvshmem_host.so (required for DeepEP fabric handle operations)
        NVSHMEM_LIB_PATH=$(pip show nvidia-nvshmem-cu13 | grep "Location:" | cut -d' ' -f2)/nvidia/nvshmem/lib && \
        ln -sf ${NVSHMEM_LIB_PATH}/libnvshmem_host.so.3 ${NVSHMEM_LIB_PATH}/libnvshmem_host.so && \
        # libnvidia-ml-dev is installed only for the duration of the DeepEP
        # build and purged again right after it.
        apt-get update && \
        apt-get install -y --no-install-recommends libnvidia-ml-dev && \
        TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" pip install --no-cache-dir --no-build-isolation -v . && \
        apt-get purge -y libnvidia-ml-dev && \
        apt-get autoremove -y && \
        rm -rf /var/lib/apt/lists/* && \
        popd; \
    fi

COPY pyproject.toml uv.lock /opt/Megatron-Bridge/
COPY src/megatron/bridge/__init__.py src/megatron/bridge/package_info.py /opt/Megatron-Bridge/src/megatron/bridge/
COPY 3rdparty/Megatron-LM/pyproject.toml 3rdparty/Megatron-LM/setup.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/
Expand Down
48 changes: 0 additions & 48 deletions docker/Dockerfile.fw_base
Original file line number Diff line number Diff line change
Expand Up @@ -122,54 +122,6 @@ RUN --mount=type=bind,source=docker/common/install_nsys.sh,target=/opt/install_n
bash /opt/install_cublas.sh --CUBLAS_VER=$CUBLAS_VERSION; \
fi

##############################################################################
##
## Install DeepEP and nvshmem
##
##############################################################################

# Build arguments for the optional DeepEP + nvshmem install. Neither has a
# default here, so the step below is skipped unless INSTALL_DEEPEP=True is
# passed with --build-arg.
#   INSTALL_DEEPEP  set to "True" to build and install DeepEP and nvshmem
#   DEEPEP_COMMIT   DeepEP git commit SHA that is fetched and checked out
ARG INSTALL_DEEPEP
ARG DEEPEP_COMMIT

# NOTE(review): HYBRID_EP_MULTINODE is presumably read by the DeepEP build — confirm.
ENV HYBRID_EP_MULTINODE=1
# Directory under which the RUN step below exposes the system rdma-core
# headers and libraries as include/ and lib/ symlinks.
ENV RDMA_CORE_HOME=/opt/rdma-core/build
# Put the CUDA runtime libraries first on the loader search path.
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH
# The DeepEP patch is bind-mounted from the build context for this step only.
# NOTE(review): pushd/popd are bash builtins — this assumes the stage's RUN
# shell is bash (the SHELL instruction is not visible in this hunk).
RUN --mount=type=bind,source=docker/patches/deepep.patch,target=/opt/deepep.patch \
    if [ "$INSTALL_DEEPEP" = "True" ]; then \
        # Upgrade system rdma-core to v60; libibverbs-dev supplies the unversioned
        # libibverbs.so symlink required at link time.
        apt-get update && apt-get install -y --allow-change-held-packages \
            rdma-core libibverbs-dev && \
        apt-get clean && \
        ARCH_LIB=$(dpkg-architecture -qDEB_HOST_MULTIARCH) && \
        # libmlx5-dev is not available in the apt sources; create the unversioned
        # symlink from the versioned one as a fallback for the DeepEP build.
        # The braces are required: `&&` and `||` have equal precedence and are
        # left-associative, so without grouping a failure of ANY earlier step
        # (e.g. apt-get install) would fall through to the `ln -sf` fallback,
        # which succeeds and lets the build continue with the error masked.
        { test -f /usr/lib/${ARCH_LIB}/libmlx5.so || \
            ln -sf /usr/lib/${ARCH_LIB}/libmlx5.so.1 /usr/lib/${ARCH_LIB}/libmlx5.so; } && \
        # Expose the system install under RDMA_CORE_HOME so DeepEP setup.py finds
        # $RDMA_CORE_HOME/include and $RDMA_CORE_HOME/lib without a separate source build.
        mkdir -p ${RDMA_CORE_HOME} && \
        ln -sfn /usr/include ${RDMA_CORE_HOME}/include && \
        ln -sfn /usr/lib/${ARCH_LIB} ${RDMA_CORE_HOME}/lib && \
        git clone https://github.com/deepseek-ai/DeepEP.git && \
        pushd DeepEP && \
        git pull && \
        # Pin the checkout to the requested commit, then apply the local patch.
        git fetch origin $DEEPEP_COMMIT && \
        git checkout FETCH_HEAD && \
        patch -p1 < /opt/deepep.patch && \
        pip install --no-cache-dir nvidia-nvshmem-cu13==3.4.5 && \
        # Create symlink for libnvshmem_host.so (required for DeepEP fabric handle operations)
        NVSHMEM_LIB_PATH=$(pip show nvidia-nvshmem-cu13 | grep "Location:" | cut -d' ' -f2)/nvidia/nvshmem/lib && \
        ln -sf ${NVSHMEM_LIB_PATH}/libnvshmem_host.so.3 ${NVSHMEM_LIB_PATH}/libnvshmem_host.so && \
        # libnvidia-ml-dev is installed only for the duration of the DeepEP
        # build and purged again right after it.
        apt-get update && \
        apt-get install -y --no-install-recommends libnvidia-ml-dev && \
        TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" pip install --no-cache-dir --no-build-isolation -v . && \
        apt-get purge -y libnvidia-ml-dev && \
        apt-get autoremove -y && \
        rm -rf /var/lib/apt/lists/* && \
        popd; \
    fi

##############################################################################
##
## Install vLLM
Expand Down
6 changes: 3 additions & 3 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,6 @@ docker build \
| `NCCL_VER` | NCCL version for the TRT-LLM install scripts |
| `CUBLAS_VER` | cuBLAS version for the TRT-LLM install scripts |
| `NVRTC_VER` | NVRTC version for the TRT-LLM install scripts |
| `INSTALL_DEEPEP` | Set to `True` to build and install DeepEP and nvshmem |
| `DEEPEP_COMMIT` | DeepEP git commit SHA |
| `REINSTALL_NSYS` | Set to `True` to reinstall Nsight Systems from the NVIDIA apt repo |
| `NSYS_VERSION` | Nsight Systems version (e.g. `2026.1.0.1085`) |
| `REINSTALL_CUDNN` | Set to `True` to reinstall cuDNN from the NVIDIA apt repo |
Expand All @@ -139,6 +137,8 @@ docker build \
| Argument | Description |
|---|---|
| `BASE_IMAGE` | Base container; set to the fw-base image when building the full stack |
| `INSTALL_DEEPEP` | Set to `True` to build and install DeepEP and nvshmem |
| `DEEPEP_COMMIT` | DeepEP git commit SHA |
| `MCORE_TRIGGERED_TESTING` | Skip uv lockfile check for cross-version Megatron-LM testing |
| `UV_CACHE_PRUNE_ARGS` | Extra arguments for `uv cache prune` |

Expand All @@ -163,5 +163,5 @@ docker build \
| `common/install_nccl.sh` | Reinstall NCCL from the public NVIDIA CUDA apt repo |
| `common/install_cudnn.sh` | Reinstall cuDNN from the public NVIDIA CUDA apt repo |
| `common/install_nsys.sh` | Reinstall Nsight Systems from the public NVIDIA CUDA apt repo |
| `patches/deepep.patch` | Patch applied to DeepEP during fw-base build |
| `patches/deepep.patch` | Patch applied to DeepEP during CI image build |
| `patches/vllm.patch` | Patch applied to vLLM after install in fw-base |
Loading