diff --git a/.gitmodules b/.gitmodules index e2cc60ff..876aa3d3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,3 +19,6 @@ path = _repos/triton-ascend url = https://gitcode.com/Ascend/triton-ascend.git branch = main +[submodule "_repos/deepspeed"] + path = _repos/deepspeed + url = https://github.com/deepspeedai/DeepSpeed.git diff --git a/Makefile b/Makefile index 2e550ed4..3e4a3c22 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ # You can set these variables from the command line, and also # from the environment for the first two. -SPHINXOPTS ?= +# Default -j 1: parallel sphinx workers multiply RSS; cgroup limits (e.g. 2Gi) often OOM-kill (exit 137) without this. +SPHINXOPTS ?= -j 1 SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build @@ -15,7 +16,8 @@ PROJECT_CONFIGS = \ _repos/LLaMA-Factory/docs:sources/LLaMA-Factory \ _repos/ms-swift/docs:sources/ms-swift \ _repos/vllm-ascend/docs/source:sources/vllm-ascend \ - _repos/triton-ascend/docs/zh:sources/triton-ascend + _repos/triton-ascend/docs/zh:sources/triton-ascend \ + _repos/deepspeed/docs/_tutorials/accelerator-setup-guide.md:sources/deepspeed/quick_start.md \ # Configure all subprojects generated path GENERATED_DOCS := sources/_generated @@ -63,7 +65,15 @@ sync-onnxruntime-doc: # Initialize submodules (always run to handle empty dirs left by git clone) init-submodules: @git submodule sync --recursive - @git submodule update --init --remote + @n=0; \ + while [ $$n -lt 3 ]; do \ + git submodule update --init --remote && exit 0; \ + n=$$((n+1)); \ + echo "git submodule update failed (attempt $$n/3), retrying in 8s..."; \ + sleep 8; \ + done; \ + echo "git submodule update failed after 3 attempts"; \ + exit 1 # Copy documentation from submodules copy-docs: init-submodules @@ -79,14 +89,24 @@ copy-docs: init-submodules rel_dst=$$(echo $$config | cut -d: -f2); \ dst="$(GENERATED_DOCS)/$$rel_dst"; \ echo "Copying $$src -> $$dst"; \ - rm -rf $$dst; \ - mkdir -p $$dst; \ - echo "Copying $$src to $$dst"; \ - cp -r "$$src"/* "$$dst"/ 2>/dev/null || echo " [WARN] Source directory does not exist or is empty: $$src"; \ + rm -rf "$$dst"; \ + if [ -f "$$src" ]; then \ + mkdir -p "$$(dirname "$$dst")"; \ + echo "Copying $$src to $$dst"; \ + cp "$$src" "$$dst" || echo " [WARN] Source file missing or copy failed: $$src"; \ + elif [ -d "$$src" ]; then \ + mkdir -p "$$dst"; \ + echo "Copying $$src to $$dst"; \ + cp -r "$$src"/* "$$dst"/ 2>/dev/null || echo " [WARN] Source directory does not exist or is empty: $$src"; \ + else \ + echo " [WARN] Source does not exist: $$src"; \ + fi; \ if [ "$$rel_dst" = "sources/vllm-ascend" ] || [ "$$rel_dst" = "sources/triton-ascend" ]; then \ rm -f "$$dst/index.md" "$$dst/index.rst" "$$dst/index.html" 2>/dev/null || true; \ else \ - find "$$dst" -name 'index.*' -delete 2>/dev/null || true; \ + if [ -d "$$dst" ]; then \ + find "$$dst" -name 'index.*' -delete 2>/dev/null || true; \ + fi; \ fi; \ done @@ -102,4 +122,4 @@ html dirhtml singlehtml latex pdf: fetch-config copy-docs sync-onnxruntime-doc # Catch-all target for other Sphinx targets (clean, help, etc.) %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/_repos/deepspeed b/_repos/deepspeed new file mode 160000 index 00000000..dc0fd295 --- /dev/null +++ b/_repos/deepspeed @@ -0,0 +1 @@ +Subproject commit dc0fd2950b4cb0234ca36fa936e8d2a659b3ca04 diff --git a/conf.py b/conf.py index 0adb6969..0ca0138a 100644 --- a/conf.py +++ b/conf.py @@ -71,7 +71,9 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '.venv', 'README.md'] +# _repos: submodule working trees duplicate content already copied to sources/_generated; +# indexing both roughly doubles RSS and OOM-kills low-memory cgroup builds (e.g. 2Gi). +exclude_patterns = ['_build', '_repos', 'Thumbs.db', '.DS_Store', '.venv', 'README.md'] # -- Options for HTML output ------------------------------------------------- diff --git a/index.rst b/index.rst index fffc707a..7b2129c7 100644 --- a/index.rst +++ b/index.rst @@ -130,9 +130,9 @@
分布式训练优化库,V0.10.1 版本起支持昇腾。
- +DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.
+