forked from NVIDIA/TransformerEngine
TE-FL Upgrade: Synchronization with TE Release V2.14 #59
Open — lxd-cumt wants to merge 61 commits into `flagos-ai:dev+te2.14.0` from `lxd-cumt:merge/dev-to-main-20260410`.
| SHA | Author | Commit message |
|---|---|---|
| 966a5b9 | ptrendx | Changed VERSION to 2.9.0 |
| 739c656 | KshitijLakhani | [JAX] Fix imports in test for deprecated jax.experimental.pjit (#2274) |
| c2a643d | ksivaman | Wheels for cuda 13 (#2278) |
| 7e72d41 | jberchtold-nvidia | [JAX] NVFP4 recipe with option to enable/disable SR, RHT, and 2D quan… |
| 9b75db3 | ksivaman | Include TE core headers in final build (#2291) |
| 8b9849a | ptrendx | Overhaul the compilation for the arch-specific features (#2279) |
| c4c185d | cyanguwa | [PyTorch] Add max_logit support for MuonClip (#2195) |
| fa71964 | ksivaman | [PyTorch] Fix CI failures due to deterministic attention backend (#2288) |
| fe9b150 | KshitijLakhani | [JAX] Fix: Skip determinism tests for bprop for all sm >=100 (#2315) |
| 0acd0e7 | ksivaman | [PyTorch] Fix attention backend and tests for `sm120` (#2320) |
| 9cc089a | KshitijLakhani | [PyT] Bump the min version expected to supported FP8 current scaling … |
| 70f5366 | jberchtold-nvidia | [JAX] Ensure JAX reference impl uses an accurate backend in our tests… |
| bae9d3a | lxd-cumt | [Version] Reset to TransformerEngine v2.9 (#5) |
| e13e38a | lxd-cumt | Fix import bugs (#6) |
| ef41367 | lxd-cumt | Fix flash-attention fallback failures (#7) |
| fd5f657 | lihongyang1990 | Multi-Backend Architecture Implementation for TransformerEngine-FL (#4) |
| 57adff4 | lihongyang1990 | Add missing __init__.py files and policy test suite (#9) |
| ec8edfc | lxd-cumt | Polish readme (#11) |
| b26b226 | lihongyang1990 | Register get_attention_backend for all backends and fix FlashAttentio… |
| a423680 | lihongyang1990 | fix nv shared lib bug. (#16) |
| fbe34bd | | Add a new vendor implementation named hygon (#15) |
| 396794e | lxd-cumt | Update the way the gems context is invoked in the FlagOS Backend (#18) |
| 3d80e63 | lxd-cumt | Unify the usage of the gems context (#20) |
| f101d2c | lihongyang1990 | fix: torch SDPA backend multi-batch support (#17) |
| 832a797 | lxd-cumt | Remove use_gems context and call flag_gems.xxx directly (#22) |
| 08cabba | dinghaodhd | Add new vendor backend METAX (#21) |
| 03d1998 | lihongyang1990 | Add multi_tensor_adam_param_remainder and context parallel support (#23) |
| 54390c7 | lxd-cumt | Fix enum mismatch in plugins (#25) |
| 48c8480 | ssuurrffaaccee | add Vendor KUNLUNXIN (#27) |
| de00a8a | lxd-cumt | Fix the incorrect registration on Kunlunxin (#29) |
| 35e1809 | lxd-cumt | Polish available check for kunlunxin (#30) |
| 8690ab4 | dinghaodhd | Add new register op get_attention_backend for METAX (#31) |
| b0a5934 | DannyP0 | [iluvatar]add vendor/iluvatar backend (#35) |
| 12b2077 | lihongyang1990 | Fix: Resolve parameter mismatch between TE_FL and NVTE functions (#34) |
| f808816 | Darryl233 | [CICD] Add workflows to validate TE QA test cases (#41) |
| 47e8ee7 | lihongyang1990 | Refactor optimizer implementations and improve multi_tensor ops (#36) |
| acced6d | jiamingwang-mt | tefl musa support (#42) |
| 4f54860 | lxd-cumt | Add python-level patches to supporting multiple platforms (#49) |
| 7f788a3 | lxd-cumt | Add scaled_masked_softmax_forward/backward for flagos backend (#52) |
| 1f98511 | lxd-cumt | Fix quantizer dtype conversion errors (#54) |
| 2188137 | chai-xiaonan | apply flagos te_groups_gemm op (#55) |
| ebcfadc | qqjxzxq | [CICD] support Metax MACA workflow (#48) |
| 9d1c48a | BrianPei | [CICD] Upload unittest coverage report to FlagCICD platform && Access… |
| d5ada9d | | merge(dev): integrate upstream release_v2.14 |
| 46b77e4 | | plugin: sync plugin APIs with upstream csrc changes |
| 3230b42 | | patch: normalize new upstream 'cuda' string hardcoding to te_device_t… |
| 0ebf525 | | fix: update stale references in fork code to match upstream renames |
| 3c86a95 | | plugin: sync plugin APIs with upstream csrc changes |
| cc03ca3 | | fix(stage9): add bottom_right_diagonal and cuda_graph params to fused… |
| 4db46ce | | fix(stage9): replace stale CPUOffloadEnabled with is_cpu_offload_enab… |
| 8fa8199 | | Final Polish |
| d7e9e7b | BrianPei | [CICD] Refactor workflows, Add integration_tests, Switch to FlagCICD … |
| ae664ea | BrianPei | [CICD] Refactor workflows, Add integration_tests, Switch to FlagCICD … |
| 24c28d0 | | merge: integrate upstream release_v2.14 via tree replacement |
| 36af46a | | chore: remove SYNC_POINT.md (intermediate sync record, not needed on … |
| e2812ae | | fix commit init |
| 7b33144 | | Fix pylint errors: remove unused imports and correct import order |
| 2c334ae | | Fix fused_rope_backward: add missing start_positions parameter to plu… |
| 879eddc | | fix test_numerics unit test |
| e12589a | | fix Latex not found errors, use mathjax |
| e5c8380 | | Fix Sphinx build warnings: suppress autoapi import resolution and unk… |
New file (`@@ -0,0 +1,15 @@`):

```yaml
# Huawei Ascend NPU configuration
image: ascend-infer:ubuntu18.04
labels:
  - npu
  - ascend
docker_options: |
  --device /dev/davinci0
  --device /dev/davinci1
  --device /dev/davinci2
  --device /dev/davinci3
  --device /dev/davinci_manager
  --device /dev/devmm_svm
  --device /dev/hisi_hdc
  --volume /usr/local/Ascend/driver:/usr/local/Ascend/driver
  --volume /usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons
```
New file (`@@ -0,0 +1,70 @@`):

```yaml
# CUDA Hardware Configuration for TransformerEngine-FL
# Refactored for A100 Nodes
# This file defines environment variables, volumes, and test filters for TE tests.

hardware_name: cuda
display_name: 'NVIDIA CUDA (A100)'

# CI image for online env
ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209

# Runner labels for self-hosted A100 node
# runner_labels:
#   - self-hosted
#   - Linux
#   - X64
#   - nvidia
#   - gpu-8

# Runner labels for online env
runner_labels:
  - nv-8g-cicd-te

# Container volumes
container_volumes:
  - /home/flagscale_cicd/flask/static:/workspace/report

# Container options
container_options: >-
  --privileged
  --gpus all
  --shm-size=500g
  --ipc=host
  --ulimit memlock=-1
  --ulimit stack=67108864
  --user root

# Platform-specific environment setup script
setup_script: .github/scripts/setup_cuda.sh

# Build environment variables (platform-specific)
build_env:
  TE_FL_SKIP_CUDA: '0'
  SKIP_CUDA_BUILD: '0'
  NVTE_WITH_CUDA: '1'
  NVTE_WITH_MACA: '0'
  TE_WITH_NCCL: '1'
  NVTE_FRAMEWORK: pytorch
  CUDA_HOME: /usr/local/cuda-12.8
  NVCC: /usr/local/cuda-12.8/bin/nvcc

# Device types to run tests on
device_types:
  - a100

# Test matrix configuration
test_matrix:
  l0_pytorch:
    path: 'qa/L0_pytorch_unittest/test.sh'
    ignored_tests:
      - test_sanity_layernorm_mlp
      - test_sanity_gpt
      - test_sanity_bert
      - test_sanity_T5
      - test_sanity_amp_and_nvfuser
      - test_sanity_drop_path
      - test_layernorm_mlp_accuracy
      - test_grouped_linear_accuracy
      - test_gpt_accuracy
      - test_basic_linear
      - test_layer_norm
```
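As a sketch of how a CI harness might consume an `ignored_tests` list like the one above, here is a minimal Python helper that turns a parsed `test_matrix` entry into pytest `--deselect` arguments. The function name `build_pytest_deselects` is hypothetical and not part of TE-FL; the actual harness may filter tests differently.

```python
def build_pytest_deselects(suite_cfg):
    """Build a pytest-style argument list from one test_matrix entry.

    suite_cfg: a dict parsed from the hardware YAML, e.g. the `l0_pytorch`
    entry above. This is an illustrative helper, not the real TE-FL code.
    """
    args = [suite_cfg["path"]]
    # Each ignored test becomes a --deselect flag so pytest skips it.
    for name in suite_cfg.get("ignored_tests") or []:
        args += ["--deselect", name]
    return args

suite_cfg = {
    "path": "qa/L0_pytorch_unittest/test.sh",
    "ignored_tests": ["test_sanity_gpt", "test_layer_norm"],
}
print(build_pytest_deselects(suite_cfg))
# → ['qa/L0_pytorch_unittest/test.sh', '--deselect', 'test_sanity_gpt', '--deselect', 'test_layer_norm']
```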
New file (`@@ -0,0 +1,68 @@`):

```yaml
# Metax Hardware Configuration for TE-FL
# This file defines CI/CD settings for Metax-based testing
# This file defines environment variables, volumes, and test filters for TE tests.

hardware_name: metax
display_name: 'Metax Tests'

# CI image for Metax dev env
# ci_image: localhost:5000/megatron-lm-with-te:v1

# CI image for online env
ci_image: harbor.baai.ac.cn/flagscale/megatron-lm-with-te:202603231839

# Runner labels for self-hosted Metax node
# runner_labels:
#   - self-hosted
#   - Linux
#   - X64
#   - metax
#   - dev

# Runner labels for online env
runner_labels:
  - mx-4g-cicd-te

# Container volumes
container_volumes:
  - /nfs/metax_fs:/nfs/metax_fs

# Container options
container_options: >-
  --uts=host
  --ipc=host
  --privileged=true
  --group-add video
  --shm-size=100gb
  --ulimit memlock=-1
  --user root
  --ulimit nofile=65535:65535
  -e PLATFORM=metax
  -e TORCH_DISTRIBUTED_BACKEND=mccl
  -e LD_LIBRARY_PATH=/opt/maca/lib:/usr/local/lib:$LD_LIBRARY_PATH

# Platform-specific environment setup script
setup_script: .github/scripts/setup_metax.sh

# Build environment variables (platform-specific)
build_env:
  TE_FL_SKIP_CUDA: '1'
  NVTE_WITH_MACA: '1'
  CUDA_HOME: /opt/maca
  MACA_HOME: /opt/maca

# Device types to run tests on
device_types:
  - c500

# Test matrix configuration
test_matrix:
  unit:
    devices:
      - c500
    # Ignored test files for unit tests
    # These files will be skipped when running pytest
    ignored_tests:
      # example: tests/unit_tests/test_example.py
      # - tests/unit_tests/test_inference.py
      # - tests/unit_tests/test_rl_utils.py
```
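The `container_options: >-` folded scalar above collapses to a single space-separated string at parse time. A harness that launches the container then needs to split it back into an argument vector; a minimal sketch (assuming the harness shells out to `docker run`, which is an assumption, not confirmed by the workflow files):

```python
import shlex

# What a YAML parser would yield for the folded scalar above (shortened here).
container_options = (
    "--uts=host --ipc=host --privileged=true --group-add video "
    "--shm-size=100gb -e PLATFORM=metax"
)

# shlex.split respects quoting, so it is safer than str.split for CLI options.
argv = ["docker", "run"] + shlex.split(container_options)
print(argv)
```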
New file (`@@ -0,0 +1,16 @@`):

```yaml
# Configuration Template
# This file describes the structure for hardware-specific configurations.
#
# Fields:
# - image: Docker image to use for the runner
# - labels: List of labels for the runner
# - docker_options: Additional Docker options for mounting devices, volumes, etc.
#
# Example:
# image: <docker_image>
# labels:
#   - <label1>
#   - <label2>
# docker_options: |
#   --option1 value1
#   --option2 value2
```
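A template like this lends itself to a simple structural check: every vendor config should at least provide the fields the template names. The sketch below assumes the template's three fields are the required set; `missing_fields` is a hypothetical helper, not part of the TE-FL CI.

```python
# Required fields, taken from the template's documented structure.
REQUIRED_FIELDS = {"image", "labels", "docker_options"}

def missing_fields(config):
    """Return template fields absent from a parsed vendor config dict.

    `missing_fields` is an illustrative helper; the real CI may validate
    configs differently (or not at all).
    """
    return sorted(REQUIRED_FIELDS - config.keys())

# A vendor config that forgot docker_options:
ascend_cfg = {"image": "ascend-infer:ubuntu18.04", "labels": ["npu", "ascend"]}
print(missing_fields(ascend_cfg))
# → ['docker_options']
```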
New file (`@@ -0,0 +1,25 @@`):

```bash
#!/usr/bin/env bash
# CUDA Platform Environment Setup Script
# Called by unit_tests_common.yml for CUDA platforms (A100, H100, etc.)
set -euo pipefail

echo "===== Step 0: Activate Python environment ====="
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
echo "PATH=$PATH" >> $GITHUB_ENV
echo "Python: $(which python3) ($(python3 --version 2>&1))"

echo "===== Step 1: Remove Existing TransformerEngine ====="
pip uninstall transformer_engine transformer_engine_torch -y || true

echo "===== Step 2: Build & Install TransformerEngine ====="
cd $GITHUB_WORKSPACE

pip install nvdlfw-inspect --quiet
pip install expecttest --quiet
pip install . -v --no-deps --no-build-isolation

echo "===== Step 3: Verify Installation ====="
python3 tests/pytorch/test_sanity_import.py

echo "===== Environment Setup Complete ====="
```
New file (`@@ -0,0 +1,50 @@`):

```bash
#!/usr/bin/env bash
# Metax Platform Environment Setup Script
# Called by unit_tests_common.yml for Metax platforms (C500, etc.)
set -euo pipefail

echo "===== Step 0: Activate Python environment ====="
source /opt/conda/etc/profile.d/conda.sh
conda activate base
echo "PATH=$PATH" >> $GITHUB_ENV
echo "Python: $(which python3) ($(python3 --version 2>&1))"

echo "===== Step 1: Base Environment Setup ====="
# Configure MACA toolchain paths
export PATH=/opt/maca/bin:$PATH
export LD_LIBRARY_PATH=/opt/maca/lib:$LD_LIBRARY_PATH
service ssh restart

echo "===== Step 2: Create nvcc Symlink (cucc -> nvcc) ====="
# TransformerEngine expects nvcc, but MACA provides cucc
ln -sf /opt/maca/tools/cu-bridge/bin/cucc /opt/maca/tools/cu-bridge/bin/nvcc
which nvcc || true

echo "===== Step 3: Install Required System Tools ====="
# Use apt to install git, curl
sed -i 's|http://mirrors.aliyun.com/ubuntu|http://archive.ubuntu.com/ubuntu|g' /etc/apt/sources.list
apt-get update -qq || true
apt-get install -y -qq git curl
# Install cmake and ninja via pip (more reliable than apt in this env)
python3 -m pip install cmake ninja torch --no-cache-dir

echo "===== Step 4: Remove Existing TransformerEngine ====="
# Prevent conflicts with preinstalled or incompatible versions
python3 -m pip uninstall transformer_engine -y || true
python3 -m pip install nvdlfw-inspect --no-deps || true

echo "===== Step 5: Install TE-FL Plugin Layer ====="
# Install TransformerEngine-FL Python layer (plugin logic)
cd $GITHUB_WORKSPACE
TE_FL_SKIP_CUDA=1 python3 setup.py install

echo "===== Step 6: Final Verification ====="
# Verify both TE Python API and backend are functional
python3 - <<'EOF'
import transformer_engine
import transformer_engine_torch as te
print("transformer_engine:", transformer_engine)
print("transformer_engine_torch:", te)
EOF

echo "===== Environment Setup Complete ====="
```
New file (`@@ -0,0 +1,32 @@`):

```yaml
name: ascend_tests

on:
  # push:
  #   branches: ["main"]
  # pull_request:
  #   branches: ["main"]
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true

jobs:
  run_tests:
    # Package manager and environment settings are read from .github/configs/ascend.yml
    uses: ./.github/workflows/all_tests_common.yml
    with:
      platform: ascend

  all_tests:
    needs: run_tests
    runs-on: ubuntu-latest
    if: always()
    steps:
      - name: Verify workflow status
        run: |
          if [ "${{ needs.run_tests.result }}" != "success" ]; then
            echo "❌ Tests workflow failed"
            exit 1
          fi
          echo "✅ All tests passed!"
```
Review comment: This configuration file is not derived from the template used by cuda.yml and meta.yml.