Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 235 additions & 8 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
"""Configuration file for the Sphinx documentation builder."""
import os
import re
from pathlib import Path

# Canonical base URL: taken from the Read the Docs environment when it is
# set, otherwise the fixed fallback host below.
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "instinct.docs.amd.com")

# html_context carries READTHEDOCS=True only when the READTHEDOCS environment
# variable is exactly the string "True"; otherwise it stays empty.
html_context = (
    {"READTHEDOCS": True} if os.environ.get("READTHEDOCS", "") == "True" else {}
)

# configurations for PDF output by Read the Docs
project = "AMD Instinct Hub"
html_title = "GPU cluster networking documentation"
author = "Advanced Micro Devices, Inc."
Expand All @@ -19,11 +22,235 @@

# Theme selection and its options. Each option appears exactly once and
# every entry ends with a comma so the dict literal parses.
html_theme = "rocm_docs_theme"
html_theme_options = {
    "flavor": "instinct",
    "link_main_doc": True,
    "use_download_button": True,
}
extensions = ["rocm_docs"]

# Directory of static assets (the stylesheet registered in setup() lives here).
html_static_path = ['_static']

# Extra files listed for the HTML output; llms.txt is also the header source
# read by generate_combined_markdown().
html_extra_path = ["llms.txt"]

# --- File selection -------------------------------------------------------

# Directory names that disqualify a path in should_skip().
EXCLUDED_DIRS = {
    "_build",
    "_templates",
    "_static",
    ".git",
    ".venv",
}

# A document needs at least this many prose lines (as judged by
# is_prose_line) to be included in the combined output.
MIN_PROSE_LINES = 10

# --- Line classification --------------------------------------------------

# A stripped line starting with any of these is markup, not prose: fence
# openers, colon-prefixed directive options, card separators, comment
# closers, and badge roles.
MARKUP_PREFIXES = (
    ":::",
    "```{",
    "```",
    ":img-top:",
    ":class",
    ":link:",
    ":link-type:",
    ":shadow:",
    ":columns:",
    ":padding:",
    ":gutter:",
    ":open:",
    ":name:",
    ":header-rows:",
    ":alt:",
    "+++",
    "-->",
    "{bdg-",
)

# MyST / Markdown patterns.

# Directive options written without a leading colon, such as "align: center",
# "alt:" or "name: foo" (common inside MyST figure/table fences).
_BARE_DIRECTIVE_RE = re.compile(r"^[a-z][a-z_-]*:\s*\S*$")

# Anchor labels of the form "(some-label)=".
_ANCHOR_LABEL_RE = re.compile(r"^\(\w[\w-]*\)=$")

# Markdown table separator rows, e.g. "|---|---|" or "| :--- | ---: |".
_MD_TABLE_SEP_RE = re.compile(r"^\|[\s|:\-]+\|$")

# reStructuredText patterns.

# Section underlines: three or more punctuation characters ("====", "----").
_RST_UNDERLINE_RE = re.compile(r"^[=\-~^\"\'#*+]{3,}$")

# Code block directives, e.g. ".. code-block:: cpp" or ".. code:: sh".
_RST_CODE_BLOCK_RE = re.compile(r"^\.\.\s+(code-block|code|sourcecode)::")

# Directives whose indented body is discarded entirely (raw HTML and the like).
_RST_SKIP_BLOCK_RE = re.compile(r"^\.\.\s+raw::")

# HTML patterns.

# A line that opens with an HTML tag ("<div>", "</p>", "<!--"). The negative
# lookahead keeps RST hyperlink continuation lines ("<https://...>`_") from
# matching, so multi-line inline hyperlinks survive.
_HTML_TAG_RE = re.compile(r"^<(?!https?://|ftp://|mailto:)[a-zA-Z/!]")

# One or more closing tags at the very end of a prose line, e.g.
# "Browse blogs.</p>" or "See the guide.</li></ul>".
_TRAILING_HTML_CLOSE_RE = re.compile(r"(</[a-zA-Z]+>)+\s*$")


def should_skip(path: Path) -> bool:
    """Return True when any component of *path* is listed in EXCLUDED_DIRS."""
    for component in path.parts:
        if component in EXCLUDED_DIRS:
            return True
    return False


def is_prose_line(line: str) -> bool:
    """Report whether *line* is prose rather than markup.

    The line is stripped first. Blank lines and lines recognised as
    MyST/RST/HTML/YAML markup yield False; everything else yields True.
    """
    text = line.strip()

    # Blank lines, known markup prefixes, and anything starting with ".."
    # (RST directives, comments, hyperlink targets, substitution definitions).
    if not text or text.startswith(MARKUP_PREFIXES) or text.startswith(".."):
        return False

    # Shapes that mark a line as markup; each pattern is anchored at the
    # start of the line.
    markup_patterns = (
        _BARE_DIRECTIVE_RE,  # bare directive options: "align: center", "alt:"
        _ANCHOR_LABEL_RE,    # anchor labels: "(some-label)="
        _MD_TABLE_SEP_RE,    # table separator rows: "|---|---|"
        _HTML_TAG_RE,        # HTML tags, but not "<https://...>`_" continuations
        _RST_UNDERLINE_RE,   # section underlines: "====", "----", "~~~~"
    )
    if any(pattern.match(text) for pattern in markup_patterns):
        return False

    # YAML frontmatter key-value pairs with a quoted key,
    # e.g. "description lang=en": "text".
    if re.match(r'^"[^"]+"\s*:', text):
        return False

    # RST field list items (":type: int") and extended meta options
    # (":description lang=en: text"). Inline roles at line start, such as
    # ":ref:`foo <bar>` describes...", are kept: the second colon is followed
    # by a backtick there, not whitespace or end-of-line.
    if re.match(r"^:[A-Za-z][A-Za-z0-9_ =-]*:(\s|$)", text):
        return False

    return True


def _filter_doc_lines(lines):
    """Return the lines of one document that belong in the combined output.

    Walks *lines* (a list of strings without line terminators) once, tracking
    which kind of block the current line sits in:

    * backtick fences and RST code-block bodies are kept verbatim;
    * ``.. raw::`` bodies, HTML comments and multi-line HTML opening tags are
      discarded;
    * outside those blocks, blank lines are kept and other lines are kept
      only when ``is_prose_line`` accepts them, with trailing HTML close tags
      stripped.
    """
    in_backtick_fence = False
    in_rst_code_block = False
    in_rst_skip_block = False
    in_html_comment = False   # inside a <!-- ... --> block
    in_html_open_tag = False  # inside an HTML opening tag split across lines
    kept = []

    for line in lines:
        stripped = line.strip()
        # Blank lines and indented lines continue an RST directive body.
        # line[0] is only evaluated for non-blank lines (short-circuit).
        blank_or_indented = not stripped or line[0] in (" ", "\t")

        # Backtick fences (MyST/Markdown): the fence lines and everything
        # between them are kept untouched.
        if stripped.startswith("```"):
            in_backtick_fence = not in_backtick_fence
            kept.append(line)
            continue
        if in_backtick_fence:
            kept.append(line)
            continue

        # HTML comment block: discard everything up to and including "-->".
        if in_html_comment:
            if "-->" in stripped:
                in_html_comment = False
            continue

        # RST skip block (e.g. ".. raw::"): discard the indented body. The
        # first non-blank, non-indented line ends the block and is then
        # processed normally below.
        if in_rst_skip_block:
            if blank_or_indented:
                continue
            in_rst_skip_block = False

        # RST code block: keep the indented body; the first non-blank,
        # non-indented line ends the block and is processed normally below.
        if in_rst_code_block:
            if blank_or_indented:
                kept.append(line)
                continue
            in_rst_code_block = False

        # Entering a raw block: drop the directive line and its body.
        if _RST_SKIP_BLOCK_RE.match(stripped):
            in_rst_skip_block = True
            continue

        # Entering a code block: the directive line itself is dropped.
        if _RST_CODE_BLOCK_RE.match(stripped):
            in_rst_code_block = True
            continue

        # HTML comment opener: drop it, and stay in comment state when the
        # comment is not closed on the same line.
        if stripped.startswith("<!--"):
            if "-->" not in stripped:
                in_html_comment = True
            continue

        # Continuation of a multi-line HTML opening tag: skip until ">".
        if in_html_open_tag:
            if ">" in stripped:
                in_html_open_tag = False
            continue

        # HTML opening tag that wraps across lines (no ">" on this line).
        if _HTML_TAG_RE.match(stripped) and ">" not in stripped:
            in_html_open_tag = True
            continue

        if not stripped:
            kept.append(line)
        elif is_prose_line(line):
            # Strip trailing HTML close tags (e.g. "See the guide.</p>").
            without_tags = _TRAILING_HTML_CLOSE_RE.sub("", line).rstrip()
            if not without_tags.strip():
                # The whole line was HTML close tags: keep the original.
                # Not normally reached, since is_prose_line rejects lines
                # that start with an HTML tag.
                kept.append(line)
            elif re.search(r"\w", without_tags):
                # Real word content remains after stripping close tags.
                kept.append(without_tags)
            # Otherwise only punctuation remains (e.g. a bare "."): discard.

    return kept


def generate_combined_markdown(app, exception):
    """Write ``llms-full.txt`` into the build output directory.

    Connected to the ``build-finished`` event in ``setup``. The output starts
    with the contents of ``llms.txt`` from the source directory (trailing
    whitespace and a trailing ``-`` rule removed), or a default heading when
    that file does not exist. Each ``*.md`` / ``*.rst`` document under the
    source directory then follows under a ``# <relative path>`` heading,
    provided it is not excluded and has at least ``MIN_PROSE_LINES`` prose
    lines.

    Args:
        app: object providing ``srcdir`` and ``outdir``.
        exception: truthy when the build failed; nothing is written then.
    """
    if exception:
        return

    docs_root = Path(app.srcdir)
    output_file = Path(app.outdir) / "llms-full.txt"
    base_file = docs_root / "llms.txt"

    if base_file.exists():
        header = base_file.read_text(encoding="utf-8").rstrip().rstrip("-").rstrip()
    else:
        header = "# AMD GPU Cluster Networking"
    combined = [header]

    all_files = sorted(
        list(docs_root.rglob("*.md")) + list(docs_root.rglob("*.rst"))
    )

    for doc_file in all_files:
        # Test exclusion on the path relative to the docs root, so that the
        # names of directories above the docs root never exclude a document.
        relative = doc_file.relative_to(docs_root)
        if should_skip(relative):
            continue

        # Best effort: an unreadable or non-UTF-8 file is left out rather
        # than failing the build. Other exception types propagate.
        try:
            content = doc_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue

        lines = content.splitlines()
        prose_count = sum(1 for line in lines if is_prose_line(line))
        if prose_count < MIN_PROSE_LINES:
            continue

        body = "\n".join(_filter_doc_lines(lines))
        combined.append(f"\n\n---\n\n# {relative}\n")
        combined.append(body.strip())

    output_file.write_text(
        "\n".join(combined) + "\n",
        encoding="utf-8",
    )


def setup(app):
    """Register the custom stylesheet and the llms-full.txt generator.

    Args:
        app: the application object; must provide ``add_css_file`` and
            ``connect``.
    """
    # Register the stylesheet once; a second identical call is redundant.
    app.add_css_file('css/custom.css')
    # Run the generator when the "build-finished" event fires.
    app.connect("build-finished", generate_combined_markdown)
19 changes: 19 additions & 0 deletions docs/llms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# AMD GPU Cluster Networking

> Configure and validate high-performance network infrastructure for AMD Instinct GPU clusters. Covers single-node NIC and PCIe validation, multi-node RDMA/RoCE configuration, ROCm-aware MPI with UCX and libfabric, RCCL collective benchmarking, and multi-node LLM inference load balancing.

## How to

- [GPU-enabled MPI](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/gpu-enabled-mpi.html): Configure ROCm-aware Open MPI with UCX for InfiniBand and RoCE, set up libfabric for Slingshot, enable UCC for GPU-aware collective operations, and run OSU Micro Benchmarks to validate MPI bandwidth and latency.
- [Single-node network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/single-node-config.html): Validate PCIe link speed and width for GPU-to-NIC connectivity, tune Mellanox and Broadcom NIC settings, run RVS PCIe tests, and benchmark GPU memory bandwidth with TransferBench and ROCm Bandwidth Test.
- [Multi-node network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html): Run OFED perftest benchmarks for host-to-host and device-to-device RDMA, validate multi-threaded throughput, configure RCCL environment variables for multi-node collective operations, and benchmark with OSU Micro Benchmarks.
- [Multi-node inference with load balancing](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-inference-lb.html): Deploy multi-node LLM inference clusters using vLLM or SGLang, configure LiteLLM or Nginx as load-balancing gateways, set up Prometheus and Grafana for monitoring, and run load tests with k6 or Apache Bench.
- [RoCE network configuration](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/roce-network-config.html): Configure RoCE NICs and switches for GPU cluster networks, prevent ARP flux and routing isolation issues, and apply recommended NIC and switch settings for RDMA over Converged Ethernet.
- [Troubleshooting](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/troubleshooting.html): Diagnose RCCL and RDMA errors using error reference tables, resolve network connectivity, firewall, ACS, NUMA balancing, and LD_LIBRARY_PATH issues, and fix MPI interface exclusion and BIOS misconfiguration problems.

## Reference

- [Hardware support](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/reference/hardware-support.html): ROCm version requirements by GPU architecture (MI355X/MI350X require ROCm 7.0.1+, MI325X requires ROCm 6.3.1+) and NIC compatibility matrices per AMD Instinct GPU generation.
- [Cluster design](https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/reference/cluster-design.html): Reference cluster network design guides and topology PDFs for AMD Instinct GPU deployments ranging from 128 to 8192 GPUs using leaf-spine architectures.

---
Loading