diff --git a/docs/common/system-setup.md b/docs/common/system-setup.md index d956c97..b5cea53 100644 --- a/docs/common/system-setup.md +++ b/docs/common/system-setup.md @@ -24,5 +24,5 @@ Basic checks (consult ROCm docs for expanded diagnostics): cat /opt/rocm/.info/version # Validate installed version -amd-smi --version +amd-smi version ``` diff --git a/docs/common/system-validation.md b/docs/common/system-validation.md index 1180e01..bf49b1b 100644 --- a/docs/common/system-validation.md +++ b/docs/common/system-validation.md @@ -612,6 +612,11 @@ Pass if bus bandwidth (large message, ~8 GB) ≥ 304 GB/s. ### rocBLAS GEMM Benchmarks +For installation, review rocBLAS documentation: + +- [Linux installation](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/install/Linux_Install_Guide.html) +- [Windows installation](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/install/Windows_Install_Guide.html) + Run each until peak (stable) TFLOPS observed. Capture highest achieved value. FP32: @@ -696,7 +701,7 @@ For complete details, extended guidance, and troubleshooting tips, consult the * ### AGFHC Installation -For AGFHC installation steps consult the AMD GPU Field Health Check (AGFHC) User Guide (UG-58416) on the [AMD Technical Information Portal](https://docs.amd.com/). +Reach out to your AMD customer success team for specific installation steps regarding AGFHC. The ROCm Validation suite (RVS) is a prerequisite of AGFHC. Make sure that this is installed as of the ROCm software installation. For example, on Ubuntu: @@ -832,9 +837,8 @@ The tables below list the recommended and suggested AGFHC validation recipes alo | gfx_lvl4 | All AMD MI3xx Instinct™ models | 1 Hour | GPU stress test to hot spot test GPU needed for DLC systems | | sleep 300 sec. 
| | 5 Minutes, sixth iteration | For silicon to contract to widen any cracks | | minihpl | All AMD MI3xx Instinct™ models | 3 Hours | Search for voltage failures and stress HBM | -| xgmi_lvl1 | All AMD MI3xx Instinct™ models | 5 Minutes | Check for link degradation | | pcie_lvl2 | All AMD MI3xx Instinct™ models | 10 Minutes | Check for link degradation | -| Total | | | 14 Hours and 45 Minutes | +| Total | | | 14 Hours and 40 Minutes | #### Recommended AGFHC Tests diff --git a/docs/conf.py b/docs/conf.py index faf1c83..713f896 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -2,6 +2,8 @@ import os import subprocess import sys +from pathlib import Path +import shutil html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "instinct.docs.amd.com") html_context = {} @@ -24,6 +26,7 @@ html_theme_options = { "flavor": "instinct", "link_main_doc": True, + "use_download_button": True, "nav_secondary_items": { "Community": "https://github.com/ROCm/ROCm/discussions", "Blogs": "https://rocm.blogs.amd.com/", @@ -38,5 +41,238 @@ external_toc_path = "./sphinx/_toc.yml" exclude_patterns = ['.venv'] + # Add anchors to headings up to level 4 -myst_heading_anchors = 4 \ No newline at end of file +myst_heading_anchors = 4 + +html_extra_path = ["llms.txt"] + +import re + +EXCLUDED_DIRS = { + "_build", + "_templates", + "_static", + ".git", + ".venv", +} + +EXCLUDED_FILES = { + "notices.md", +} + +MARKUP_PREFIXES = ( + ":::", + "```{", + "```", + ":img-top:", + ":class", + ":link:", + ":link-type:", + ":shadow:", + ":columns:", + ":padding:", + ":gutter:", + ":open:", + ":name:", + ":header-rows:", + ":alt:", + "+++", + "-->", + "{bdg-", +) + +# Matches lines like "align: center", "alt:", "name: foo" (directive options +# not starting with a colon, common in MyST figure/table fences) +_BARE_DIRECTIVE_RE = re.compile(r"^[a-z][a-z_-]*:\s*\S*$") + +# Matches MyST/RST anchor labels like "(gpu-arch-documentation)=" +_ANCHOR_LABEL_RE = re.compile(r"^\(\w[\w-]*\)=$") + +# Matches RST 
section underlines (e.g. "====", "----", "~~~~") +_RST_UNDERLINE_RE = re.compile(r"^[=\-~^\"\'#*+]{3,}$") + +# Matches RST code block directives (e.g. ".. code-block:: cpp", ".. code:: sh") +_RST_CODE_BLOCK_RE = re.compile(r"^\.\.\s+(code-block|code|sourcecode)::") + +# Matches markdown table separator rows (e.g. "|---|---|", "| :--- | ---: |"). +_MD_TABLE_SEP_RE = re.compile(r"^\|[\s|:\-]+\|$") + +# Matches RST directives whose indented body should be discarded (e.g. raw HTML). +_RST_SKIP_BLOCK_RE = re.compile(r"^\.\.\s+raw::") + +# Matches HTML tags (e.g. "
", "

", " block + in_html_open_tag = False # inside a multi-line HTML opening tag + kept = [] + for line in lines: + stripped = line.strip() + # Backtick fences (MyST/Markdown) + if stripped.startswith("```"): + in_backtick_fence = not in_backtick_fence + kept.append(line) + continue + if in_backtick_fence: + kept.append(line) + continue + # HTML comment block (): discard all content until --> + if in_html_comment: + if "-->" in stripped: + in_html_comment = False + continue + # RST skip block (e.g. .. raw::): discard all indented content + if in_rst_skip_block: + if not stripped or line[0] in (" ", "\t"): + continue + in_rst_skip_block = False + # RST code block: exit when a non-blank, non-indented line appears + if in_rst_code_block: + if not stripped or line[0] in (" ", "\t"): + kept.append(line) + continue + in_rst_code_block = False + # RST raw block: enter and discard both the directive and its body + if _RST_SKIP_BLOCK_RE.match(stripped): + in_rst_skip_block = True + continue + # RST code block: enter on directive line (directive itself is dropped) + if _RST_CODE_BLOCK_RE.match(stripped): + in_rst_code_block = True + continue + # HTML comment open (): discard opener and enter state + if stripped.startswith("" not in stripped: + in_html_comment = True + continue + # Multi-line HTML opening tag: skip continuation lines until > + if in_html_open_tag: + if ">" in stripped: + in_html_open_tag = False + continue + # Detect HTML opening tags that wrap across lines (no > on this line) + if _HTML_TAG_RE.match(stripped) and ">" not in stripped: + in_html_open_tag = True + continue + if not stripped: + kept.append(line) + elif is_prose_line(line): + # Strip trailing HTML close tags (e.g. "See the guide.

") + cleaned = _TRAILING_HTML_CLOSE_RE.sub("", line).rstrip() + cleaned_stripped = cleaned.strip() + if not cleaned_stripped: + # Entire line was HTML close tags — keep original (shouldn't + # normally reach here since _is_prose_line filters HTML). + kept.append(line) + elif re.search(r"\w", cleaned_stripped): + # Line has real word content after stripping close tags. + kept.append(cleaned) + # else: only punctuation remains (e.g. bare ".") — discard. + cleaned = "\n".join(kept) + + combined.append(f"\n\n---\n\n# {relative}\n") + combined.append(cleaned.strip()) + + output_file.write_text( + "\n".join(combined) + "\n", + encoding="utf-8", + ) + +def setup(app): + app.connect("build-finished", generate_combined_markdown) diff --git a/docs/gpus/mi300x.md b/docs/gpus/mi300x.md index b20a089..cfc95b8 100644 --- a/docs/gpus/mi300x.md +++ b/docs/gpus/mi300x.md @@ -152,12 +152,12 @@ Performance validation ensures the system meets MI300X specifications. For detai ```bash rocblas-bench -f gemm \ - -r s -m 4000 \ + -r s -m 4000 -n 4000 -k 4000 \ --lda 4000 --ldb 4000 --ldc 4000 \ --transposeA N --transposeB T ``` -**Pass:** ≥ 94100 TFLOPS +**Pass:** ≥ 94100 GFLOPS +++ **Fail:** otherwise ::: @@ -179,7 +179,7 @@ rocblas-bench -f gemm_strided_batched_ex \ --batch_count 5 ``` -**Pass:** ≥ 130600 TFLOPS +**Pass:** ≥ 130600 GFLOPS +++ **Fail:** otherwise ::: @@ -201,7 +201,7 @@ rocblas-bench -f gemm_strided_batched_ex \ --batch_count 5 ``` -**Pass:** ≥ 162700 TFLOPS +**Pass:** ≥ 162700 GFLOPS +++ **Fail:** otherwise ::: @@ -210,15 +210,14 @@ rocblas-bench -f gemm_strided_batched_ex \ [BabelStream](../common/system-validation.md#babelstream) ^^^ -| Copy # | Threshold (MB/s) | +| Kernel | Threshold (MB/s) | |--------|-----------------| -| 1 | ≥ 4,177,285 | -| 2 | ≥ 4,067,069 | -| 3 | ≥ 3,920,853 | -| 4 | ≥ 3,885,301 | -| 5 | ≥ 3,660,781 | +| Copy | ≥ 4,177,285 | +| Mul | ≥ 4,067,069 | +| Add | ≥ 3,920,853 | +| Triad | ≥ 3,885,301 | +| Dot | ≥ 3,660,781 | -**Pass:** Greater than or 
equal to 162700 TFLOPS +++ **Fail:** otherwise ::: diff --git a/docs/gpus/mi325x.md b/docs/gpus/mi325x.md index 67cfb58..41b6112 100644 --- a/docs/gpus/mi325x.md +++ b/docs/gpus/mi325x.md @@ -151,12 +151,12 @@ Performance validation ensures the system meets MI325X specifications. For detai ```bash rocblas-bench -f gemm \ - -r s -m 4000 \ + -r s -m 4000 -n 4000 -k 4000 \ --lda 4000 --ldb 4000 --ldc 4000 \ --transposeA N --transposeB T ``` -**Pass:** ≥ 94100 TFLOPS +**Pass:** ≥ 94100 GFLOPS +++ **Fail:** otherwise ::: @@ -178,7 +178,7 @@ rocblas-bench -f gemm_strided_batched_ex \ --batch_count 5 ``` -**Pass:** ≥ 130600 TFLOPS +**Pass:** ≥ 130600 GFLOPS +++ **Fail:** otherwise ::: @@ -200,7 +200,7 @@ rocblas-bench -f gemm_strided_batched_ex \ --batch_count 5 ``` -**Pass:** ≥ 162700 TFLOPS +**Pass:** ≥ 162700 GFLOPS +++ **Fail:** otherwise ::: diff --git a/docs/gpus/mi350x.md b/docs/gpus/mi350x.md index 1dcea18..6385fd8 100644 --- a/docs/gpus/mi350x.md +++ b/docs/gpus/mi350x.md @@ -52,18 +52,20 @@ Example (truncated for brevity – expect 8 lines): f5:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Device 75a0 ``` -## Acceptance Criteria +## Acceptance Criteria Checklist -The MI350X system acceptance process validates that the platform is correctly configured, stable, and performing to expectations. Follow the sequence: Prerequisites → Basic Health Checks → System Validation (AGFHC recipes) → Performance Benchmarks. - -### System Acceptance Process +This section presents the high-level cluster acceptance validation criteria in a clear, checklist-driven format designed to enable efficient execution and tracking. The checklist is used to verify that the system meets all required technical, operational, and performance criteria necessary to achieve "Go-Live" readiness. It is organized into the following key areas: 1. **[Prerequisites Validation](#prerequisites-validation)** - Ensure all system requirements and dependencies are met 2. 
**[Basic Health Checks](#basic-health-checks)** - Verify hardware detection and basic system health -3. **[System Validation](#system-validation)** - Conduct comprehensive stress testing and qualification +3. **[System Validation](#system-validation)** - Conduct comprehensive single-node and multi-node stress testing and qualification 4. **[Performance Benchmarks](#performance-benchmarks)** - Validate compute, memory, and interconnect performance -System is accepted when all required recipe runs and benchmarks pass without errors and no hardware faults appear in logs. +Each area consists of a defined set of criteria that are hyperlinked to the corresponding sections within this guide, enabling users to quickly access detailed procedures, execution steps, and supporting guidance. + +The System Validation area, which includes both single-node and multi-node testing, defines minimum required execution (run) times for each test. These requirements ensure that validation is conducted under appropriate conditions to accurately assess system stability, performance, and reliability. + +Successful completion of this checklist, with no errors or hardware faults observed in validation logs, confirms that the cluster has been properly configured, validated at both the single-node and multi-node levels, and is capable of supporting sustained AI workloads in a production environment. ### Prerequisites Validation @@ -101,12 +103,29 @@ These checks ensure fundamental system health and proper GPU detection. For deta AGFHC (AMD GPU Field Health Check) provides structured recipes exercising PCIe, HBM, compute, power/thermal and fabric. 
-| Recipe | Command | Purpose | Pass Criteria | -|--------|---------|---------|---------------| -| [all_lvl5](../common/system-validation.md#all_lvl5) | `/opt/amd/agfhc/agfhc -r all_lvl5 -o ` | Broad ~2h system-level coverage (PCIe, HBM, compute, power) | Overall result PASS / return code 0 | -| [hbm_lvl5](../common/system-validation.md#hbm_lvl5) (run twice) | `/opt/amd/agfhc/agfhc -r hbm_lvl5:i=2 -o ` | Intensive HBM stress & ECC observation | Both iterations PASS / no memory errors | -| [pcie_lvl2](../common/system-validation.md#pcie_lvl2) | `/opt/amd/agfhc/agfhc -r pcie_lvl2 -o ` | PCIe bandwidth & link health | PASS / expected link stability | -| [miniHPL](../common/system-validation.md#minihpl) (optional) | `/opt/amd/agfhc/agfhc -t miniHPL:d=120m -o ` | Linpack-like integration stress (MI350X) | PASS / completes without failures | +#### Single-Node Tests + +Following single-node tests must be performed at the required run time with no failures reported in validation logs. + +| Test | Command | Run Time | Purpose | Pass Criteria | +|--------|---------|----------|---------|---------------| +| [all_lvl5](../common/system-validation.md#all_lvl5) | `/opt/amd/agfhc/agfhc -r all_lvl5 -o ` | 2 hours | Broad system-level coverage (PCIe, HBM, compute, power) | Overall result PASS / return code 0 | +| [hbm_lvl5](../common/system-validation.md#hbm_lvl5) (4 iterations) | `/opt/amd/agfhc/agfhc -r hbm_lvl5:i=4 -o ` | 8 hours | Intensive HBM stress & ECC observation | All iterations PASS / no memory errors | +| [gfx_lvl4](../common/system-validation.md#gfx_lvl4) | `/opt/amd/agfhc/agfhc -r gfx_lvl4 -o ` | 1 hour | GPU compute stress test | PASS / return code 0 | +| [miniHPL](../common/system-validation.md#minihpl) | `/opt/amd/agfhc/agfhc -t minihpl:d=3h -o ` | 3 hours (10 hours recommended) | Linpack-like integration stress | PASS / completes without failures | +| [pcie_lvl2](../common/system-validation.md#pcie_lvl2) | `/opt/amd/agfhc/agfhc -r pcie_lvl2 -o ` | 10 minutes | 
PCIe bandwidth & link health | PASS / expected link stability | +| [Single-node RCCL](../common/rccl-benchmarking.md#single-node-rccl-testing) | `all_reduce_perf -b 8 -e 8G -f 2 -g 8` | 2–11 minutes | Single-node GPU interconnect validation | busbw meets expected thresholds | +| [AI Workloads](../network/validation.md#ai-workload-validation-with-the-cluster-validation-suite) | See workload validation | 1–24 hours | Sustained AI workload (Llama 3.1 70B with JAX) | Completes without failures | + +#### Multi-Node Tests + +Following multi-node tests must be performed at the required run time with no failures reported in validation logs. + +| Test | Reference | Run Time | Purpose | Pass Criteria | +|------|-----------|----------|---------|---------------| +| [OFED Performance Tests](../network/rdma-benchmarking.md#ofed-performance-tests) | Network validation | 2 hours | RDMA fabric bandwidth and latency | All tests PASS / expected bandwidth | +| [Multi-node RCCL](../network/validation.md#rccl-multi-node-fabric-test) | Network validation | Up to 128 nodes, 10 hours | Multi-node GPU fabric validation | All nodes PASS / expected bandwidth | +| [AI Workloads](../network/validation.md#ai-workload-validation-with-the-cluster-validation-suite) | Cluster validation | 24 hours | Sustained AI workload (Llama 3.1 405B with JAX) | Completes without failures | Review `results.json` in the output directory or terminal summary; any FAIL requires remediation before acceptance. diff --git a/docs/gpus/mi355x.md b/docs/gpus/mi355x.md index 6bb4ed6..5e54af1 100644 --- a/docs/gpus/mi355x.md +++ b/docs/gpus/mi355x.md @@ -52,18 +52,20 @@ Example (truncated for brevity – expect 8 lines): f5:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Device 75a3 ``` -## Acceptance Criteria +## Acceptance Criteria Checklist -The MI355X system acceptance process validates that the platform is correctly configured, stable, and performing to expectations. 
Follow the sequence: Prerequisites → Basic Health Checks → System Validation (AGFHC recipes) → Performance Benchmarks. - -### System Acceptance Process +This section presents the high-level cluster acceptance validation criteria in a clear, checklist-driven format designed to enable efficient execution and tracking. The checklist is used to verify that the system meets all required technical, operational, and performance criteria necessary to achieve "Go-Live" readiness. It is organized into the following key areas: 1. **[Prerequisites Validation](#prerequisites-validation)** - Ensure all system requirements and dependencies are met 2. **[Basic Health Checks](#basic-health-checks)** - Verify hardware detection and basic system health -3. **[System Validation](#system-validation)** - Conduct comprehensive stress testing and qualification +3. **[System Validation](#system-validation)** - Conduct comprehensive single-node and multi-node stress testing and qualification 4. **[Performance Benchmarks](#performance-benchmarks)** - Validate compute, memory, and interconnect performance -System is accepted when all required recipe runs and benchmarks pass without errors and no hardware faults appear in logs. +Each area consists of a defined set of criteria that are hyperlinked to the corresponding sections within this guide, enabling users to quickly access detailed procedures, execution steps, and supporting guidance. + +The System Validation area, which includes both single-node and multi-node testing, defines minimum required execution (run) times for each test. These requirements ensure that validation is conducted under appropriate conditions to accurately assess system stability, performance, and reliability. 
+ +Successful completion of this checklist, with no errors or hardware faults observed in validation logs, confirms that the cluster has been properly configured, validated at both the single-node and multi-node levels, and is capable of supporting sustained AI workloads in a production environment. ### Prerequisites Validation @@ -101,12 +103,29 @@ These checks ensure fundamental system health and proper GPU detection. For deta AGFHC (AMD GPU Field Health Check) provides structured recipes exercising PCIe, HBM, compute, power/thermal and fabric. -| Recipe | Command | Purpose | Pass Criteria | -|--------|---------|---------|---------------| -| [all_lvl5](../common/system-validation.md#all_lvl5) | `/opt/amd/agfhc/agfhc -r all_lvl5 -o ` | Broad ~2h system-level coverage (PCIe, HBM, compute, power) | Overall result PASS / return code 0 | -| [hbm_lvl5](../common/system-validation.md#hbm_lvl5) (run twice) | `/opt/amd/agfhc/agfhc -r hbm_lvl5:i=2 -o ` | Intensive HBM stress & ECC observation | Both iterations PASS / no memory errors | -| [pcie_lvl2](../common/system-validation.md#pcie_lvl2) | `/opt/amd/agfhc/agfhc -r pcie_lvl2 -o ` | PCIe bandwidth & link health | PASS / expected link stability | -| [miniHPL](../common/system-validation.md#minihpl) (optional) | `/opt/amd/agfhc/agfhc -t miniHPL:d=120m -o ` | Linpack-like integration stress (MI355X) | PASS / completes without failures | +#### Single-Node Tests + +Following single-node tests must be performed at the required run time with no failures reported in validation logs. 
+ +| Test | Command | Run Time | Purpose | Pass Criteria | +|--------|---------|----------|---------|---------------| +| [all_lvl5](../common/system-validation.md#all_lvl5) | `/opt/amd/agfhc/agfhc -r all_lvl5 -o ` | 2 hours | Broad system-level coverage (PCIe, HBM, compute, power) | Overall result PASS / return code 0 | +| [hbm_lvl5](../common/system-validation.md#hbm_lvl5) (4 iterations) | `/opt/amd/agfhc/agfhc -r hbm_lvl5:i=4 -o ` | 8 hours | Intensive HBM stress & ECC observation | All iterations PASS / no memory errors | +| [gfx_lvl4](../common/system-validation.md#gfx_lvl4) | `/opt/amd/agfhc/agfhc -r gfx_lvl4 -o ` | 1 hour | GPU compute stress test | PASS / return code 0 | +| [miniHPL](../common/system-validation.md#minihpl) | `/opt/amd/agfhc/agfhc -t minihpl:d=3h -o ` | 3 hours (10 hours recommended) | Linpack-like integration stress | PASS / completes without failures | +| [pcie_lvl2](../common/system-validation.md#pcie_lvl2) | `/opt/amd/agfhc/agfhc -r pcie_lvl2 -o ` | 10 minutes | PCIe bandwidth & link health | PASS / expected link stability | +| [Single-node RCCL](../common/rccl-benchmarking.md#single-node-rccl-testing) | `all_reduce_perf -b 8 -e 8G -f 2 -g 8` | 2–11 minutes | Single-node GPU interconnect validation | busbw meets expected thresholds | +| [AI Workloads](../network/validation.md#ai-workload-validation-with-the-cluster-validation-suite) | See workload validation | 1–24 hours | Sustained AI workload (Llama 3.1 70B with JAX) | Completes without failures | + +#### Multi-Node Tests + +Following multi-node tests must be performed at the required run time with no failures reported in validation logs. 
+ +| Test | Reference | Run Time | Purpose | Pass Criteria | +|------|-----------|----------|---------|---------------| +| [OFED Performance Tests](../network/rdma-benchmarking.md#ofed-performance-tests) | Network validation | 2 hours | RDMA fabric bandwidth and latency | All tests PASS / expected bandwidth | +| [Multi-node RCCL](../network/validation.md#rccl-multi-node-fabric-test) | Network validation | Up to 128 nodes, 10 hours | Multi-node GPU fabric validation | All nodes PASS / expected bandwidth | +| [AI Workloads](../network/validation.md#ai-workload-validation-with-the-cluster-validation-suite) | Cluster validation | 24 hours | Sustained AI workload (Llama 3.1 405B with JAX) | Completes without failures | Review `results.json` in the output directory or terminal summary; any FAIL requires remediation before acceptance. diff --git a/docs/llms.txt b/docs/llms.txt new file mode 100644 index 0000000..c6d01f7 --- /dev/null +++ b/docs/llms.txt @@ -0,0 +1,40 @@ +# AMD Instinct Customer Acceptance Guide + +> A structured, repeatable methodology for configuring, validating, benchmarking, and baselining AMD Instinct GPU platforms at both single-node and multi-node (cluster) levels. Covers node validation, cluster networking, RDMA benchmarking, and acceptance criteria for HPC and AI workloads. + +## GPU platforms + +- [AMD Instinct MI355X](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/gpus/mi355x.html): MI355X-specific prerequisites, health checks, validation steps, and performance acceptance criteria. +- [AMD Instinct MI350X](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/gpus/mi350x.html): MI350X-specific prerequisites, health checks, validation steps, and performance acceptance criteria. +- [AMD Instinct MI325X](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/gpus/mi325x.html): MI325X-specific requirements, specifications, and acceptance testing criteria. 
+- [AMD Instinct MI300X](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/gpus/mi300x.html): MI300X-specific requirements, specifications, and acceptance testing criteria. + +## Node validation + +- [System prerequisites](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/prerequisites.html): System requirements common to all AMD Instinct GPU models. +- [Firmware updates](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/firmware-updates.html): Supported methods for updating GPU runtime firmware and system firmware on AMD Instinct platforms. +- [BIOS settings](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/bios-settings.html): BIOS configuration settings common to all AMD Instinct GPU models. +- [Kernel parameters](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/kernel-parameters.html): GRUB and kernel parameter settings common to all AMD Instinct GPU models. +- [OS tuning](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/os-tuning.html): C-states, NUMA configuration, and environment variables for AMD Instinct GPU systems. +- [System setup](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-setup.html): Preparing, installing, and validating the ROCm software stack on AMD Instinct GPU systems. +- [Health checks](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/health-checks.html): Basic system health checks to verify components are operating at peak performance before extensive validation. +- [System validation](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html): RVS, AGFHC, and additional tools for validating AMD Instinct GPU systems. 
+- [Workload validation](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/workload-validation.html): Validate AI model performance, including LLMs, on AMD Instinct systems. +- [RCCL benchmarking](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/rccl-benchmarking.html): Benchmark and validate RCCL collective communication performance for single-node and multi-node configurations. + +## Cluster and network validation + +- [NIC driver installation](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/nic-installation.html): Vendor-specific guidance for installing and configuring NIC drivers and supporting software. +- [Network configuration](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/configuration.html): Configure network routing to ensure each backend interface is used exclusively for GPU-driven cluster communications. +- [Topology mapping](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/topology-mapping.html): Map GPUs and NICs by NUMA node and PCIe root complex to minimize latency for RDMA and AI/HPC workloads. +- [Network optimization](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/optimization.html): NIC performance optimization for cluster networking. +- [RDMA benchmarking](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/rdma-benchmarking.html): Validate RDMA performance and reliability, including link speed verification, RDMA benchmarks, and RCCL collective operations. +- [Cluster validation](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/validation.html): Multi-node network and cluster validation for data throughput and cluster efficiency. 
+ +## Reference + +- [Related documentation](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/reference/related-documentation.html): Reference documents and links for system setup and test execution. +- [Glossary](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/reference/glossary.html): Terms and definitions used throughout this guide. +- [ROCm technical support](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/reference/rocm-techsupport.html): Collect logs using the rocm_techsupport.sh utility for troubleshooting. + +--- diff --git a/docs/network/nic-installation.md b/docs/network/nic-installation.md index b2040df..28e0491 100644 --- a/docs/network/nic-installation.md +++ b/docs/network/nic-installation.md @@ -64,10 +64,10 @@ Prerequisite: Driver & Tools version needs to match the firmware version. ```bash # Single card reset - nicctl reset card -c + sudo nicctl reset card -c # or reset all cards - nicctl reset card –all + sudo nicctl reset card --all ``` Firmware and software updates are complete. diff --git a/docs/network/optimization.md b/docs/network/optimization.md deleted file mode 100644 index b54a0f1..0000000 --- a/docs/network/optimization.md +++ /dev/null @@ -1,93 +0,0 @@ -# Network Configuration for Performance Optimization - -Configure your NIC to ensure best performance. The following details the optimization based on a particular NIC. - -## AMD Pensando Pollara 400 - -Several host configurations and NIC configurations should be done to achieve the best performance on the Pollara AI NIC. - -### Enable PFC - -The uplink port needs to be enabled for RX/TX pause and pause-type as PFC. 
The configuration can be applied to a single port by specifying the `` or all the ports in the system: - -```bash -# To get the port_id -# nicctl show port -# nicctl update port -p --pause-type pfc --rx-pause enable --tx-pause enable -``` - -#### Recommended PFC Parameters - -Use the script below to update the DCQCN setting of Pollara AI NIC. The DSCP value or the Traffic Class value of your application, ex, TC value of perf-test and RCCL, should match the data_dscp value in the script. - -```bash -#!/bin/bash -for i in $(sudo nicctl show port | grep Port | awk {'print $3'}); do sudo nicctl update port -p $i --pause-type pfc --rx-pause enable --tx-pause enable; done -for i in $(sudo nicctl show port | grep Port | awk {'print $3'}); do sudo nicctl update port --port $i --auto-neg enable; done -cts_dscp=46 -cts_prio=6 -data_dscp=24 -data_prio=0 -default_prio=3 -cnp_dscp=46 -cnp_prio=6 -sudo nicctl update qos pfc --priority 0 --no-drop disable -sudo nicctl update qos dscp-to-purpose --dscp $cts_dscp --purpose xccl-cts -sudo nicctl update qos dscp-to-purpose $data_dscp --purpose data -sudo nicctl update qos --classification-type pcp -sudo nicctl update qos --classification-type dscp -sudo nicctl update qos dscp-to-priority --dscp 0-63 --priority 0 -sudo nicctl update qos dscp-to-priority --dscp 0-23,25-45,47-63 --priority $default_prio -sudo nicctl update qos dscp-to-priority --dscp $cts_dscp --priority strict -sudo nicctl update qos dscp-to-priority --dscp $data_dscp --priority $data_prio -sudo nicctl update qos dscp-to-priority --dscp $cnp_dscp --priority $cnp_prio -sudo nicctl update qos pfc --priority $data_prio --no-drop enable -sudo nicctl update qos scheduling --priority $data_prio,$default_prio,$cts_prio --dwrr 99,1,0 --rate-limit 0,0,10 -``` - -### Configure DCQCN - -The DCQCN configuration is crucial to achieve the optimal performance in a bigger cluster. 
Use the following script to apply the DCQCN parameters that are recommended by AMD: - -```bash -#!/bin/bash -TOKEN_BUCKET_SIZE=800000 -AI_RATE=160 -ALPHA_UPDATE_INTERVAL=1 -ALPHA_UPDATE_G=512 -INITIAL_ALPHA_VALUE=64 -RATE_INCREASE_BYTE_COUNT=431068 -HAI_RATE=300 -RATE_REDUCE_MONITOR_PERIOD=1 -RATE_INCREASE_THRESHOLD=1 -RATE_INCREASE_INTERVAL=1 -CNP_DSCP=46 -ROCE_DEVICES=$(ibv_devices | grep ionic_ | awk '{print $1}' | paste -sd " ") -for roce_dev in $ROCE_DEVICES -do -sudo nicctl update dcqcn -r $roce_dev -i 1 \ ---token-bucket-size $TOKEN_BUCKET_SIZE \ ---ai-rate $AI_RATE \ ---alpha-update-interval $ALPHA_UPDATE_INTERVAL \ ---alpha-update-g $ALPHA_UPDATE_G \ ---initial-alpha-value $INITIAL_ALPHA_VALUE \ ---rate-increase-byte-count $RATE_INCREASE_BYTE_COUNT \ ---hai-rate $HAI_RATE \ ---rate-reduce-monitor-period $RATE_REDUCE_MONITOR_PERIOD \ ---rate-increase-threshold $RATE_INCREASE_THRESHOLD \ ---rate-increase-interval $RATE_INCREASE_INTERVAL \ ---cnp-dscp $CNP_DSCP -done -``` - -## Broadcom 400G NIC - -For Broadcom 400G NICs, perform the following actions to guarantee proper operation and peak performance: - -* Enable PCIe relaxed ordering. -* Enable RDMA support. -* Select the RoCE performance profile. -* Exclude all speeds except 400G from the speed mask. -* Disable unused ports to optimize resources. - -For detailed configuration, use the scripts provided in the [cluster networking GitHub repository](https://github.com/ROCm/cluster-networking/tree/main/niccli_scripts)