diff --git a/docs/conf.py b/docs/conf.py
index fa908a2..7b1b4a8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,10 +1,13 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
+"""Configuration file for the Sphinx documentation builder."""
+import os
+import re
+from pathlib import Path
+
+html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "instinct.docs.amd.com")
+html_context = {}
+if os.environ.get("READTHEDOCS", "") == "True":
+ html_context["READTHEDOCS"] = True
-# configurations for PDF output by Read the Docs
project = "AMD Instinct Hub"
html_title = "GPU cluster networking documentation"
author = "Advanced Micro Devices, Inc."
@@ -19,11 +22,235 @@
html_theme = "rocm_docs_theme"
html_theme_options = {
- "flavor": "instinct"
+ "flavor": "instinct",
+ "link_main_doc": True,
+ "use_download_button": True,
}
extensions = ["rocm_docs"]
html_static_path = ['_static']
+html_extra_path = ["llms.txt"]
+
+EXCLUDED_DIRS = {
+ "_build",
+ "_templates",
+ "_static",
+ ".git",
+ ".venv",
+}
+
+MARKUP_PREFIXES = (
+ ":::",
+ "```{",
+ "```",
+ ":img-top:",
+ ":class",
+ ":link:",
+ ":link-type:",
+ ":shadow:",
+ ":columns:",
+ ":padding:",
+ ":gutter:",
+ ":open:",
+ ":name:",
+ ":header-rows:",
+ ":alt:",
+ "+++",
+ "-->",
+ "{bdg-",
+)
+
+# Matches lines like "align: center", "alt:", "name: foo" (directive options
+# not starting with a colon, common in MyST figure/table fences)
+_BARE_DIRECTIVE_RE = re.compile(r"^[a-z][a-z_-]*:\s*\S*$")
+
+# Matches MyST/RST anchor labels like "(some-label)="
+_ANCHOR_LABEL_RE = re.compile(r"^\(\w[\w-]*\)=$")
+
+# Matches RST section underlines (e.g. "====", "----", "~~~~")
+_RST_UNDERLINE_RE = re.compile(r"^[=\-~^\"\'#*+]{3,}$")
+
+# Matches RST code block directives (e.g. ".. code-block:: cpp", ".. code:: sh")
+_RST_CODE_BLOCK_RE = re.compile(r"^\.\.\s+(code-block|code|sourcecode)::")
+
+# Matches markdown table separator rows (e.g. "|---|---|", "| :--- | ---: |").
+_MD_TABLE_SEP_RE = re.compile(r"^\|[\s|:\-]+\|$")
+
+# Matches RST directives whose indented body should be discarded (e.g. raw HTML).
+_RST_SKIP_BLOCK_RE = re.compile(r"^\.\.\s+raw::")
+
+# Matches HTML tags (e.g. "
", "", " block
+ in_html_open_tag = False # inside a multi-line HTML opening tag
+ kept = []
+ for line in lines:
+ stripped = line.strip()
+ # Backtick fences (MyST/Markdown)
+ if stripped.startswith("```"):
+ in_backtick_fence = not in_backtick_fence
+ kept.append(line)
+ continue
+ if in_backtick_fence:
+ kept.append(line)
+ continue
+ # HTML comment block (<!-- ... -->): discard all content until -->
+ if in_html_comment:
+ if "-->" in stripped:
+ in_html_comment = False
+ continue
+ # RST skip block (e.g. .. raw::): discard all indented content
+ if in_rst_skip_block:
+ if not stripped or line[0] in (" ", "\t"):
+ continue
+ in_rst_skip_block = False
+ # RST code block: exit when a non-blank, non-indented line appears
+ if in_rst_code_block:
+ if not stripped or line[0] in (" ", "\t"):
+ kept.append(line)
+ continue
+ in_rst_code_block = False
+ # RST raw block: enter and discard both the directive and its body
+ if _RST_SKIP_BLOCK_RE.match(stripped):
+ in_rst_skip_block = True
+ continue
+ # RST code block: enter on directive line (directive itself is dropped)
+ if _RST_CODE_BLOCK_RE.match(stripped):
+ in_rst_code_block = True
+ continue
+ # HTML comment open (<!-- without a closing -->): discard opener and enter state
+ if stripped.startswith("<!--") and "-->" not in stripped:
+ in_html_comment = True
+ continue
+ # Multi-line HTML opening tag: skip continuation lines until >
+ if in_html_open_tag:
+ if ">" in stripped:
+ in_html_open_tag = False
+ continue
+ # Detect HTML opening tags that wrap across lines (no > on this line)
+ if _HTML_TAG_RE.match(stripped) and ">" not in stripped:
+ in_html_open_tag = True
+ continue
+ if not stripped:
+ kept.append(line)
+ elif is_prose_line(line):
+ # Strip trailing HTML close tags (e.g. "See the guide.</a>")
+ cleaned = _TRAILING_HTML_CLOSE_RE.sub("", line).rstrip()
+ cleaned_stripped = cleaned.strip()
+ if not cleaned_stripped:
+ # Entire line was HTML close tags — keep original (shouldn't
+ # normally reach here since is_prose_line filters HTML).
+ kept.append(line)
+ elif re.search(r"\w", cleaned_stripped):
+ # Line has real word content after stripping close tags.
+ kept.append(cleaned)
+ # else: only punctuation remains (e.g. bare ".") — discard.
+ cleaned = "\n".join(kept)
+
+ combined.append(f"\n\n---\n\n# {relative}\n")
+ combined.append(cleaned.strip())
+
+ output_file.write_text(
+ "\n".join(combined) + "\n",
+ encoding="utf-8",
+ )
+
+
def setup(app):
- app.add_css_file('css/custom.css')
\ No newline at end of file
+ app.add_css_file('css/custom.css')
+ app.connect("build-finished", generate_combined_markdown)
diff --git a/docs/llms.txt b/docs/llms.txt
new file mode 100644
index 0000000..5a12e40
--- /dev/null
+++ b/docs/llms.txt
@@ -0,0 +1,19 @@
+# AMD GPU Cluster Networking
+
+> Configure and validate high-performance network infrastructure for AMD Instinct GPU clusters. Covers single-node NIC and PCIe validation, multi-node RDMA/RoCE configuration, ROCm-aware MPI with UCX and libfabric, RCCL collective benchmarking, and multi-node LLM inference load balancing.
+
+## How to
+
+- [GPU-enabled MPI](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/gpu-enabled-mpi.html): Configure ROCm-aware Open MPI with UCX for InfiniBand and RoCE, set up libfabric for Slingshot, enable UCC for GPU-aware collective operations, and run OSU Micro Benchmarks to validate MPI bandwidth and latency.
+- [Single-node network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/single-node-config.html): Validate PCIe link speed and width for GPU-to-NIC connectivity, tune Mellanox and Broadcom NIC settings, run RVS PCIe tests, and benchmark GPU memory bandwidth with TransferBench and ROCm Bandwidth Test.
+- [Multi-node network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html): Run OFED perftest benchmarks for host-to-host and device-to-device RDMA, validate multi-threaded throughput, configure RCCL environment variables for multi-node collective operations, and benchmark with OSU Micro Benchmarks.
+- [Multi-node inference with load balancing](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-inference-lb.html): Deploy multi-node LLM inference clusters using vLLM or SGLang, configure LiteLLM or Nginx as load-balancing gateways, set up Prometheus and Grafana for monitoring, and run load tests with k6 or Apache Bench.
+- [RoCE network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/roce-network-config.html): Configure RoCE NICs and switches for GPU cluster networks, prevent ARP flux and routing isolation issues, and apply recommended NIC and switch settings for RDMA over Converged Ethernet.
+- [Troubleshooting](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/troubleshooting.html): Diagnose RCCL and RDMA errors using error reference tables, resolve network connectivity, firewall, ACS, NUMA balancing, and LD_LIBRARY_PATH issues, and fix MPI interface exclusion and BIOS misconfiguration problems.
+
+## Reference
+
+- [Hardware support](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/reference/hardware-support.html): ROCm version requirements per GPU architecture (MI355X/MI350X require ROCm 7.0.1+, MI325X requires ROCm 6.3.1+) and NIC compatibility matrices per AMD Instinct GPU generation.
+- [Cluster design](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/reference/cluster-design.html): Reference cluster network design guides and topology PDFs for AMD Instinct GPU deployments ranging from 128 to 8192 GPUs using leaf-spine architectures.
+
+---