diff --git a/docs/conf.py b/docs/conf.py index fa908a2..7b1b4a8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,10 +1,13 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html +"""Configuration file for the Sphinx documentation builder.""" +import os +import re +from pathlib import Path + +html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "instinct.docs.amd.com") +html_context = {} +if os.environ.get("READTHEDOCS", "") == "True": + html_context["READTHEDOCS"] = True -# configurations for PDF output by Read the Docs project = "AMD Instinct Hub" html_title = "GPU cluster networking documentation" author = "Advanced Micro Devices, Inc." @@ -19,11 +22,235 @@ html_theme = "rocm_docs_theme" html_theme_options = { - "flavor": "instinct" + "flavor": "instinct", + "link_main_doc": True, + "use_download_button": True, } extensions = ["rocm_docs"] html_static_path = ['_static'] +html_extra_path = ["llms.txt"] + +EXCLUDED_DIRS = { + "_build", + "_templates", + "_static", + ".git", + ".venv", +} + +MARKUP_PREFIXES = ( + ":::", + "```{", + "```", + ":img-top:", + ":class", + ":link:", + ":link-type:", + ":shadow:", + ":columns:", + ":padding:", + ":gutter:", + ":open:", + ":name:", + ":header-rows:", + ":alt:", + "+++", + "-->", + "{bdg-", +) + +# Matches lines like "align: center", "alt:", "name: foo" (directive options +# not starting with a colon, common in MyST figure/table fences) +_BARE_DIRECTIVE_RE = re.compile(r"^[a-z][a-z_-]*:\s*\S*$") + +# Matches MyST/RST anchor labels like "(some-label)=" +_ANCHOR_LABEL_RE = re.compile(r"^\(\w[\w-]*\)=$") + +# Matches RST section underlines (e.g. "====", "----", "~~~~") +_RST_UNDERLINE_RE = re.compile(r"^[=\-~^\"\'#*+]{3,}$") + +# Matches RST code block directives (e.g. ".. code-block:: cpp", ".. 
code:: sh") +_RST_CODE_BLOCK_RE = re.compile(r"^\.\.\s+(code-block|code|sourcecode)::") + +# Matches markdown table separator rows (e.g. "|---|---|", "| :--- | ---: |"). +_MD_TABLE_SEP_RE = re.compile(r"^\|[\s|:\-]+\|$") + +# Matches RST directives whose indented body should be discarded (e.g. raw HTML). +_RST_SKIP_BLOCK_RE = re.compile(r"^\.\.\s+raw::") + +# Matches HTML tags (e.g. "
", "

", " block + in_html_open_tag = False # inside a multi-line HTML opening tag + kept = [] + for line in lines: + stripped = line.strip() + # Backtick fences (MyST/Markdown) + if stripped.startswith("```"): + in_backtick_fence = not in_backtick_fence + kept.append(line) + continue + if in_backtick_fence: + kept.append(line) + continue + # HTML comment block (): discard all content until --> + if in_html_comment: + if "-->" in stripped: + in_html_comment = False + continue + # RST skip block (e.g. .. raw::): discard all indented content + if in_rst_skip_block: + if not stripped or line[0] in (" ", "\t"): + continue + in_rst_skip_block = False + # RST code block: exit when a non-blank, non-indented line appears + if in_rst_code_block: + if not stripped or line[0] in (" ", "\t"): + kept.append(line) + continue + in_rst_code_block = False + # RST raw block: enter and discard both the directive and its body + if _RST_SKIP_BLOCK_RE.match(stripped): + in_rst_skip_block = True + continue + # RST code block: enter on directive line (directive itself is dropped) + if _RST_CODE_BLOCK_RE.match(stripped): + in_rst_code_block = True + continue + # HTML comment open (): discard opener and enter state + if stripped.startswith("" not in stripped: + in_html_comment = True + continue + # Multi-line HTML opening tag: skip continuation lines until > + if in_html_open_tag: + if ">" in stripped: + in_html_open_tag = False + continue + # Detect HTML opening tags that wrap across lines (no > on this line) + if _HTML_TAG_RE.match(stripped) and ">" not in stripped: + in_html_open_tag = True + continue + if not stripped: + kept.append(line) + elif is_prose_line(line): + # Strip trailing HTML close tags (e.g. "See the guide.

") + cleaned = _TRAILING_HTML_CLOSE_RE.sub("", line).rstrip() + cleaned_stripped = cleaned.strip() + if not cleaned_stripped: + # Entire line was HTML close tags — keep original (shouldn't + # normally reach here since is_prose_line filters HTML). + kept.append(line) + elif re.search(r"\w", cleaned_stripped): + # Line has real word content after stripping close tags. + kept.append(cleaned) + # else: only punctuation remains (e.g. bare ".") — discard. + cleaned = "\n".join(kept) + + combined.append(f"\n\n---\n\n# {relative}\n") + combined.append(cleaned.strip()) + + output_file.write_text( + "\n".join(combined) + "\n", + encoding="utf-8", + ) + + def setup(app): - app.add_css_file('css/custom.css') \ No newline at end of file + app.add_css_file('css/custom.css') + app.connect("build-finished", generate_combined_markdown) diff --git a/docs/llms.txt b/docs/llms.txt new file mode 100644 index 0000000..5a12e40 --- /dev/null +++ b/docs/llms.txt @@ -0,0 +1,19 @@ +# AMD GPU Cluster Networking + +> Configure and validate high-performance network infrastructure for AMD Instinct GPU clusters. Covers single-node NIC and PCIe validation, multi-node RDMA/RoCE configuration, ROCm-aware MPI with UCX and libfabric, RCCL collective benchmarking, and multi-node LLM inference load balancing. + +## How to + +- [GPU-enabled MPI](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/gpu-enabled-mpi.html): Configure ROCm-aware Open MPI with UCX for InfiniBand and RoCE, set up libfabric for Slingshot, enable UCC for GPU-aware collective operations, and run OSU Micro Benchmarks to validate MPI bandwidth and latency. +- [Single-node network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/single-node-config.html): Validate PCIe link speed and width for GPU-to-NIC connectivity, tune Mellanox and Broadcom NIC settings, run RVS PCIe tests, and benchmark GPU memory bandwidth with TransferBench and ROCm Bandwidth Test. 
+- [Multi-node network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html): Run OFED perftest benchmarks for host-to-host and device-to-device RDMA, validate multi-threaded throughput, configure RCCL environment variables for multi-node collective operations, and benchmark with OSU Micro Benchmarks. +- [Multi-node inference with load balancing](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-inference-lb.html): Deploy multi-node LLM inference clusters using vLLM or SGLang, configure LiteLLM or Nginx as load-balancing gateways, set up Prometheus and Grafana for monitoring, and run load tests with k6 or Apache Bench. +- [RoCE network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/roce-network-config.html): Configure RoCE NICs and switches for GPU cluster networks, prevent ARP flux and routing isolation issues, and apply recommended NIC and switch settings for RDMA over Converged Ethernet. +- [Troubleshooting](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/troubleshooting.html): Diagnose RCCL and RDMA errors using error reference tables, resolve network connectivity, firewall, ACS, NUMA balancing, and LD_LIBRARY_PATH issues, and fix MPI interface exclusion and BIOS misconfiguration problems. + +## Reference + +- [Hardware support](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/reference/hardware-support.html): ROCm version requirements per GPU architecture (MI355X/MI350X require ROCm 7.0.1+, MI325X requires ROCm 6.3.1+) and NIC compatibility matrices per AMD Instinct GPU generation. +- [Cluster design](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/reference/cluster-design.html): Reference cluster network design guides and topology PDFs for AMD Instinct GPU deployments ranging from 128 to 8192 GPUs using leaf-spine architectures. + +---