diff --git a/.gitmodules b/.gitmodules index e2cc60ff..876aa3d3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,3 +19,6 @@ path = _repos/triton-ascend url = https://gitcode.com/Ascend/triton-ascend.git branch = main +[submodule "_repos/deepspeed"] + path = _repos/deepspeed + url = https://github.com/deepspeedai/DeepSpeed.git diff --git a/Makefile b/Makefile index 2e550ed4..3e4a3c22 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ # You can set these variables from the command line, and also # from the environment for the first two. -SPHINXOPTS ?= +# Default -j 1: parallel sphinx workers multiply RSS; cgroup limits (e.g. 2Gi) often OOM-kill (exit 137) without this. +SPHINXOPTS ?= -j 1 SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build @@ -15,7 +16,8 @@ PROJECT_CONFIGS = \ _repos/LLaMA-Factory/docs:sources/LLaMA-Factory \ _repos/ms-swift/docs:sources/ms-swift \ _repos/vllm-ascend/docs/source:sources/vllm-ascend \ - _repos/triton-ascend/docs/zh:sources/triton-ascend + _repos/triton-ascend/docs/zh:sources/triton-ascend \ + _repos/deepspeed/docs/_tutorials/accelerator-setup-guide.md:sources/deepspeed/quick_start.md \ # Configure all subprojects generated path GENERATED_DOCS := sources/_generated @@ -63,7 +65,15 @@ sync-onnxruntime-doc: # Initialize submodules (always run to handle empty dirs left by git clone) init-submodules: @git submodule sync --recursive - @git submodule update --init --remote + @n=0; \ + while [ $$n -lt 3 ]; do \ + git submodule update --init --remote && exit 0; \ + n=$$((n+1)); \ + echo "git submodule update failed (attempt $$n/3), retrying in 8s..."; \ + sleep 8; \ + done; \ + echo "git submodule update failed after 3 attempts"; \ + exit 1 # Copy documentation from submodules copy-docs: init-submodules @@ -79,14 +89,24 @@ copy-docs: init-submodules rel_dst=$$(echo $$config | cut -d: -f2); \ dst="$(GENERATED_DOCS)/$$rel_dst"; \ echo "Copying $$src -> $$dst"; \ - rm -rf $$dst; \ - mkdir -p $$dst; \ - echo "Copying $$src to $$dst"; \ - cp -r "$$src"/* "$$dst"/ 2>/dev/null || echo " [WARN] Source directory does not exist or is empty: $$src"; \ + rm -rf "$$dst"; \ + if [ -f "$$src" ]; then \ + mkdir -p "$$(dirname "$$dst")"; \ + echo "Copying $$src to $$dst"; \ + cp "$$src" "$$dst" || echo " [WARN] Source file missing or copy failed: $$src"; \ + elif [ -d "$$src" ]; then \ + mkdir -p "$$dst"; \ + echo "Copying $$src to $$dst"; \ + cp -r "$$src"/* "$$dst"/ 2>/dev/null || echo " [WARN] Source directory does not exist or is empty: $$src"; \ + else \ + echo " [WARN] Source does not exist: $$src"; \ + fi; \ if [ "$$rel_dst" = "sources/vllm-ascend" ] || [ "$$rel_dst" = "sources/triton-ascend" ]; then \ rm -f "$$dst/index.md" "$$dst/index.rst" "$$dst/index.html" 2>/dev/null || true; \ else \ - find "$$dst" -name 'index.*' -delete 2>/dev/null || true; \ + if [ -d "$$dst" ]; then \ + find "$$dst" -name 'index.*' -delete 2>/dev/null || true; \ + fi; \ fi; \ done @@ -102,4 +122,4 @@ html dirhtml singlehtml latex pdf: fetch-config copy-docs sync-onnxruntime-doc # Catch-all target for other Sphinx targets (clean, help, etc.) %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/_repos/deepspeed b/_repos/deepspeed new file mode 160000 index 00000000..dc0fd295 --- /dev/null +++ b/_repos/deepspeed @@ -0,0 +1 @@ +Subproject commit dc0fd2950b4cb0234ca36fa936e8d2a659b3ca04 diff --git a/conf.py b/conf.py index 0adb6969..0ca0138a 100644 --- a/conf.py +++ b/conf.py @@ -71,7 +71,9 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '.venv', 'README.md'] +# _repos: submodule working trees duplicate content already copied to sources/_generated; +# indexing both roughly doubles RSS and OOM-kills low-memory cgroup builds (e.g. 2Gi). +exclude_patterns = ['_build', '_repos', 'Thumbs.db', '.DS_Store', '.venv', 'README.md'] # -- Options for HTML output ------------------------------------------------- diff --git a/index.rst b/index.rst index fffc707a..7b2129c7 100644 --- a/index.rst +++ b/index.rst @@ -130,9 +130,9 @@

DeepSpeed

-

分布式训练优化库,V0.10.1 版本起支持昇腾。

- +

DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.

+
diff --git a/sources/deepspeed/index.rst b/sources/deepspeed/index.rst index 29bff2a2..5f8dc2e7 100644 --- a/sources/deepspeed/index.rst +++ b/sources/deepspeed/index.rst @@ -4,5 +4,4 @@ DeepSpeed .. toctree:: :maxdepth: 2 - install.rst - quick_start.rst + ../_generated/sources/deepspeed/quick_start diff --git a/sources/deepspeed/install.rst b/sources/deepspeed/install.rst deleted file mode 100644 index 54ed9b08..00000000 --- a/sources/deepspeed/install.rst +++ /dev/null @@ -1,73 +0,0 @@ -安装指南 -============== - -.. note:: - 在本示例之前,请确保已经安装了 `昇腾环境 <../ascend/quick_install.html>`_ 和 `PyTorch <../pytorch/install.html>`_ 环境。 - -1. 安装DeepSpeed ------------------ -安装DeepSpeed最简单的方式是通过 ``pip`` 。 - -.. code-block:: shell - :linenos: - - pip install deepspeed - - -2. 通过源码安装 ------------------- -从 `GitHub `_ 克隆DeepSpeed项目后,可以通过 ``pip`` 来通过源码编译。 - -.. code-block:: shell - :linenos: - - pip install . - - -3. 预编译DeepSpeed算子(可选) ----------------------------------- -如果不想使用JIT编译模式,而想要预编译DeepSpeed算子,可以通过设置环境变量的方式完成算子的预编译。 - -.. code-block:: shell - :linenos: - - DS_BUILD_OPS=1 pip install deepspeed - -4. 安装验证 ------------ - -安装完成后,可以通过 ``ds_report`` 命令查看安装结果 - -.. code-block:: shell - :linenos: - - -------------------------------------------------- - DeepSpeed C++/CUDA extension op report - -------------------------------------------------- - NOTE: Ops not installed will be just-in-time (JIT) compiled at - runtime if needed. Op compatibility means that your system - meet the required dependencies to JIT install the op. - -------------------------------------------------- - JIT compiled ops requires ninja - ninja .................. [OKAY] - -------------------------------------------------- - op name ................ installed .. compatible - -------------------------------------------------- - deepspeed_not_implemented [NO] ....... [OKAY] - async_io ............... [NO] ....... [OKAY] - cpu_adagrad ............ [NO] ....... [OKAY] - cpu_adam ............... [NO] ....... [OKAY] - cpu_lion ............... [NO] ....... [OKAY] - fused_adam ............. [NO] ....... [OKAY] - transformer_inference .. [NO] ....... [OKAY] - -------------------------------------------------- - DeepSpeed general environment info: - torch install path ............... ['/root/miniconda3/envs/ds/lib/python3.10/site-packages/torch'] - torch version .................... 2.2.0 - deepspeed install path ........... ['/root/miniconda3/envs/ds/lib/python3.10/site-packages/deepspeed'] - deepspeed info ................... 0.14.4, unknown, unknown - deepspeed wheel compiled w. ...... torch 2.2 - torch_npu install path ........... ['/root/miniconda3/envs/ds/lib/python3.10/site-packages/torch_npu'] - torch_npu version ................ 2.2.0 - ascend_cann version .............. 8.0.RC2.alpha002 - shared memory (/dev/shm) size .... 20.00 GB diff --git a/sources/deepspeed/quick_start.rst b/sources/deepspeed/quick_start.rst deleted file mode 100644 index 00baae35..00000000 --- a/sources/deepspeed/quick_start.rst +++ /dev/null @@ -1,34 +0,0 @@ -快速开始 -========== - -.. note:: - 在本示例之前,请确保已经安装了 `DeepSpeed <./install.html>`_ 环境。 如果还未安装,可以执行 ``pip install deepspeed`` 完成安装。 - - -1. 使用DeepSpeed多卡并行训练 -------------------------------- -以下代码使用了cifar10数据集,使用DeepSpeed训练模型在多张NPU卡上进行模型训练(来自 `DeepSpeed Examples `_),自DeepSpeed v0.12.6之后,代码无需任何修改,即可自动检测NPU并进行训练。 - -.. rli:: https://raw.githubusercontent.com/microsoft/DeepSpeedExamples/master/training/cifar/cifar10_deepspeed.py - :language: python - :linenos: - -2. 训练结果查看 ----------------- -训练完成后,会打印模型对图像识别的结果。 - -.. code-block:: shell - :linenos: - - Finished Training - Accuracy of the network on the 10000 test images: 57 % - Accuracy of plane : 65 % - Accuracy of car : 67 % - Accuracy of bird : 52 % - Accuracy of cat : 34 % - Accuracy of deer : 52 % - Accuracy of dog : 49 % - Accuracy of frog : 59 % - Accuracy of horse : 66 % - Accuracy of ship : 66 % - Accuracy of truck : 56 %